Table of Contents

Introduction

Analysis and data associated with the manuscript: "Regenerative lineages and immune-mediated pruning in lung cancer metastasis", Laughney et al. (Nature Medicine 2019)

We used 10X droplet-based single-cell RNA sequencing (scRNA-seq) and graph-based phenotypic analysis to explore tumour cell heterogeneity through the lens of epithelial development and regeneration in lung adenocarcinoma primary tumors and metastases, to assess parallels between tumor cell plasticity and developmental hierarchies. Using patient primary and metastatic tumors, as well as a mouse model of lung cancer metastasis, our analyses provide evidence of regenerative cell types and lineage promiscuity in untreated primary tumours and phenotypic states spanning key stages of embryonic lung morphogenesis in metastases. This notebook walks the reader through key analyses and illustrates how to query annotated pre- and post-processed data.

For each data subset, we provide:

DFsubset = count matrix (cells x genes)
NDFsubset = library-size normalized count matrix (cells x genes)
INDFsubset = imputed and library-size normalized dataframe (cells x genes)
DIMENSIONSsubset = cells x dimensions (i.e. principal components, diffusion components)
GENE_RANK_MASTsubset = genes x cell group (DEG in cell group compared to all other cells)

Install dependencies

In [1]:
fig_dpi = 300
In [2]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

import time
import os
import numpy as np
import seaborn as sns
import pandas as pd
import re
import h5py
from copy import deepcopy
from colors import rgb, hex
import scipy.cluster.hierarchy as hc

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import matplotlib.colors as colors
import matplotlib.font_manager
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Comic Sans MS'

from scipy import stats
from scipy.cluster.hierarchy import dendrogram, fcluster, leaves_list, set_link_color_palette

import seqc
import seqc.plot
import seqc.stats

from pylab import *

from scipy.stats import zscore
import cmocean
from scipy.stats import linregress

def gauss(x,mu,sigma,A):
    return A*exp(-(x-mu)**2/2/sigma**2)

def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

def rgb2hex(r,g,b):
    hex = "#{:02x}{:02x}{:02x}".format(r,g,b)
    return hex

def hex2rgb(hexcode):
    rgb = tuple(map(ord,hexcode[1:].decode('hex')))
    return rgb

import scipy.cluster.hierarchy as sch
import re

_nsre = re.compile('([0-9]+)')
def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)] 
findfont: Font family ['Raleway'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Lato'] not found. Falling back to DejaVu Sans.
In [3]:
%matplotlib inline
In [4]:
# H5 WRAPPER
class H5:

    def __init__(self, archive_name: str):
        """Wrapper for the pandas HDFStore class which ensures that all interactions with
        the archive result in a closed, flushed archive.

        In order to ensure data usability, all data must be submitted in DataFrame format.
        This decision was made to encourage users to pair metadata with sequencing data,
        and reduce the incidence of unexpected data permutation.

        :param archive_name: name of the h5 archive to open. If the archive does not exist
          it will be created using a blosc5 filter

        :method ls: list contents of the archive
        :method save: save an object to the h5 archive
        :method load: load an object from the archive
        :method remove: remove a DataFrame from the archive
        :method is_open: returns True if the h5 archive is open, else False
        """
        if os.path.isfile(archive_name):
            self._archive = pd.HDFStore(archive_name, mode='a')
            self._archive.close()
        else:
            self._archive = pd.HDFStore(
                archive_name, mode='a', complib='blosc', complevel=5)
            self._archive.close()

    def __repr__(self):
        self._archive.open()
        try:
            return repr(self._archive)
        finally:
            self._archive.close()

    def save(self, data: pd.DataFrame, location: str) -> None:
        """Save DataFrame data to the h5 archive in location.

        :param data: DataFrame object to store
        :param location: filepath to save the object in the h5 hierarchy
        """
        if not isinstance(data, pd.DataFrame):
            if isinstance(data, np.ndarray):
                res = input('np.ndarray class detected. Save as pd.DataFrame with '
                            'ascending integer indices? [y/n] ')
                if res in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']:
                    data = pd.DataFrame(data)
                else:
                    print('User elected not to save DataFrame, archive is unmodified.')
                    return
            else:
                raise TypeError('only pd.DataFrame objects can be saved using this '
                                'class. To save np.ndarray objects please see the tables '
                                'package.')
        self._archive.open()
        try:
            self._archive[location] = data
        finally:
            self._archive.close()

    def load(self, location: str) -> None:
        """Load and return the dataframe found at location in the archive.

        :param location: str, location of object to retrieve from h5
        :return: pd.DataFrame, object found at location
        """
        self._archive.open()
        try:
            return self._archive[location]
        finally:
            self._archive.close()

    def ls(self) -> None:
        """list archive contents"""
        try:
            self._archive.open()
            print(self._archive.keys())
        finally:
            self._archive.close()

    def remove(self, location: str) -> None:
        """remove the DataFrame at location from the archive

        Note: removing a dataframe at a branch node will remove all leaves sharing this
        prefix. e.g. in an archive containing:

        /data
        /data/filtered
        /data/metadata
        /new_data/data

        removing /data would remove the first three DataFrame objects from the archive.

        :param location: location of DataFrame to remove
        :return: None
        """

        self._archive.open()
        try:
            if location not in self._archive.keys():
                raise ValueError(
                    '{} not contained in archive, nothing to remove.'.format(location))
            else:
                removed = [k for k in self._archive.keys()
                           if k.startswith(location + '/')]
                if len(removed) != 0:
                    res = input(
                        'Removing branch node {}, which is a prefix for {!a} will remove '
                        'all listed DataFrames. Continue with removal? [y/n] '.format(
                            location, removed))
                    if res not in ['y', 'yes', 'Y', 'YES', 'True', 'true', '1']:
                        print('returned without deletion.')
                        return
                self._archive.remove(location)
        finally:
            self._archive.close()

    @property
    def is_open(self) -> bool:
        return self._archive.is_open
In [5]:
# FUNCTIONS FOR DOT PLOT
def _plot_gene_groups_brackets(gene_groups_ax, group_positions, group_labels,
                               left_adjustment=-0.3, right_adjustment=0.3, rotation=None, orientation='top'):
    """
    Draws brackets that represent groups of genes on the give axis.
    For best results, this axis is located on top of an image whose
    x axis contains gene names.

    The gene_groups_ax should share the x axis with the main ax.

    Eg: gene_groups_ax = fig.add_subplot(axs[0, 0], sharex=dot_ax)

    This function is used by dotplot, heatmap etc.

    Parameters
    ----------
    gene_groups_ax : matplotlib axis
        In this axis the gene marks are drawn
    group_positions : list of `tuples`
        Each item in the list, should contain the start and end position that the
        bracket should cover.
        Eg. [(0, 4), (5, 8)] means that there are two brackets, one for the var_names (eg genes)
        in positions 0-4 and other for positions 5-8
    group_labels :  list
        List of group labels
    left_adjustment : `float`
        adjustment to plot the bracket start slightly before or after the first gene position.
        If the value is negative the start is moved before.
    right_adjustment : `float`
        adjustment to plot the bracket end slightly before or after the last gene position
        If the value is negative the start is moved before.
    rotation : `float` (default None)
        rotation degrees for the labels. If not given, small labels (<4 characters) are not
        rotated, otherwise, they are rotated 90 degrees
    orientation : `str` (default `top`)
        location of the brackets. Either `top` or `right`
    Returns
    -------
    None
    """
    import matplotlib.patches as patches
    from matplotlib.path import Path

    # get the 'brackets' coordinates as lists of start and end positions

    left = [x[0] + left_adjustment for x in group_positions]
    right = [x[1] + right_adjustment for x in group_positions]

    # verts and codes are used by PathPatch to make the brackets
    verts = []
    codes = []
    if orientation == 'top':
        # rotate labels if any of them is longer than 4 characters
        if rotation is None and group_labels is not None and len(group_labels) > 0:
            if max([len(x) for x in group_labels]) > 4:
                rotation = 90
            else:
                rotation = 0
        for idx in range(len(left)):
            verts.append((left[idx], 0))  # lower-left
            verts.append((left[idx], 0.6))  # upper-left
            verts.append((right[idx], 0.6))  # upper-right
            verts.append((right[idx], 0))  # lower-right

            codes.append(Path.MOVETO)
            codes.append(Path.LINETO)
            codes.append(Path.LINETO)
            codes.append(Path.LINETO)

            try:
                group_x_center = left[idx] + float(right[idx] - left[idx]) / 2
                gene_groups_ax.text(group_x_center, 1.1, group_labels[idx], ha='center',
                                    va='bottom', rotation=rotation)
            except:
                pass
    else:
        top = left
        bottom = right
        for idx in range(len(top)):
            verts.append((0, top[idx]))    # upper-left
            verts.append((0.15, top[idx]))  # upper-right
            verts.append((0.15, bottom[idx]))  # lower-right
            verts.append((0, bottom[idx]))  # lower-left

            codes.append(Path.MOVETO)
            codes.append(Path.LINETO)
            codes.append(Path.LINETO)
            codes.append(Path.LINETO)

            try:
                diff = bottom[idx] - top[idx]
                group_y_center = top[idx] + float(diff) / 2
                if diff * 2 < len(group_labels[idx]):
                    # cut label to fit available space
                    group_labels[idx] = group_labels[idx][:int(diff * 2)] + "."
                gene_groups_ax.text(0.6, group_y_center,  group_labels[idx], ha='right',
                                    va='center', rotation=270, fontsize='small')
            except Exception as e:
                print('problems {}'.format(e))
                pass

    path = Path(verts, codes)

    patch = patches.PathPatch(path, facecolor='None', lw=1.5,edgecolor='k')

    gene_groups_ax.add_patch(patch)
    gene_groups_ax.spines['right'].set_visible(False)
    gene_groups_ax.spines['top'].set_visible(False)
    gene_groups_ax.spines['left'].set_visible(False)
    gene_groups_ax.spines['bottom'].set_visible(False)
    gene_groups_ax.grid(False)

    # remove y ticks
    gene_groups_ax.tick_params(axis='y', left=False, labelleft=False)
    # remove x ticks and labels
    gene_groups_ax.tick_params(axis='x', bottom=False, labelbottom=False, labeltop=False)
    #gene_groups_ax.tick_params(direction='out', length=6, width=2, colors='r',
    #           grid_color='r', grid_alpha=0.5)

def dotplot(adata, var_names, categories, groupby=None, use_raw=None, log=False, num_categories=7,
            expression_cutoff=0., mean_only_expressed=False, color_map='Reds', dot_max=None,
            dot_min=None, figsize=None, dendrogram=False, gene_symbols=None,
            var_group_positions=None, standard_scale=None, smallest_dot=0.,
            var_group_labels=None, var_group_rotation=None, layer=None, show=None,
            save=None, **kwds):
    """
    Makes a *dot plot* of the expression values of `var_names`.

    For each var_name and each `groupby` category a dot is plotted. Each dot
    represents two values: mean expression within each category (visualized by
    color) and fraction of cells expressing the var_name in the
    category (visualized by the size of the dot).  If groupby is not given, the
    dotplot assumes that all data belongs to a single category.

    **Note**: A gene is considered expressed if the expression value in the adata
    (or adata.raw) is above the specified threshold which is zero by default.

    An example of dotplot usage is to visualize, for multiple marker genes,
    the mean value and the percentage of cells expressing the gene accross multiple clusters.

    Parameters
    ----------
    {common_plot_args}
    expression_cutoff : `float` (default: `0.`)
        Expression cutoff that is used for binarizing the gene expression and determining the fraction
        of cells expressing given genes. A gene is expressed only if the expression value is greater than
        this threshold.
    mean_only_expressed : `bool` (default: `False`)
        If True, gene expression is averaged only over the cells expressing the given genes.
    color_map : `str`, optional (default: `Reds`)
        String denoting matplotlib color map.
    dot_max : `float` optional (default: `None`)
        If none, the maximum dot size is set to the maximum fraction value found (e.g. 0.6). If given,
        the value should be a number between 0 and 1. All fractions larger than dot_max are clipped to
        this value.
    dot_min : `float` optional (default: `None`)
        If none, the minimum dot size is set to 0. If given,
        the value should be a number between 0 and 1. All fractions smaller than dot_min are clipped to
        this value.
    standard_scale : {{'var', 'group'}}, optional (default: None)
        Whether or not to standardize that dimension between 0 and 1, meaning for each variable or group,
        subtract the minimum and divide each by its maximum.
    smallest_dot : `float` optional (default: 0.)
        If none, the smallest dot has size 0. All expression levels with `dot_min` are potted with
        `smallest_dot` dot size.

    {show_save_ax}
    **kwds : keyword arguments
        Are passed to `matplotlib.pyplot.scatter`.

    Returns
    -------
    List of :class:`~matplotlib.axes.Axes`

    Examples
    -------
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.pl.dotplot(adata, ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ'],
    ...               groupby='bulk_labels', dendrogram=True)
    """
    if isinstance(var_names, str):
        var_names = [var_names]
    #categories, obs_tidy = _prepare_dataframe(adata, var_names, groupby, use_raw, log, num_categories,
    #                                          layer=layer, gene_symbols=gene_symbols)
    obs_tidy = adata

    # for if category defined by groupby (if any) compute for each var_name
    # 1. the fraction of cells in the category having a value > expression_cutoff
    # 2. the mean value over the category

    # 1. compute fraction of cells having value > expression_cutoff
    # transform obs_tidy into boolean matrix using the expression_cutoff
    obs_bool = obs_tidy > expression_cutoff

    # compute the sum per group which in the boolean matrix this is the number
    # of values > expression_cutoff, and divide the result by the total number of values
    # in the group (given by `count()`)
    fraction_obs = obs_bool.groupby(level=groupby,axis=0).sum() / obs_bool.groupby(level=groupby,axis=0).count()
    fraction_obs = fraction_obs.ix[categories,:] #re-order by categories

    # 2. compute mean value
    if mean_only_expressed:
        mean_obs = obs_tidy.mask(~obs_bool).groupby(level=groupby,axis=0).mean().fillna(0)
        mean_obs = mean_obs.ix[categories,:] #re-order by categories
    else:
        mean_obs = obs_tidy.groupby(level=groupby,axis=0).mean()

    if standard_scale == 'group':
        mean_obs = mean_obs.sub(mean_obs.min(1), axis=0)
        mean_obs = mean_obs.div(mean_obs.max(1), axis=0).fillna(0)
    elif standard_scale == 'var':
        mean_obs -= mean_obs.min(0)
        mean_obs = (mean_obs / mean_obs.max(0)).fillna(0)
    elif standard_scale is None:
        pass
    else:
        logg.warn('Unknown type for standard_scale, ignored')


    dendro_width = 0.8 if dendrogram else 0
    colorbar_width = 0.2
    colorbar_width_spacer = 0.5
    size_legend_width = 0.25
    if figsize is None:
        height = len(categories) * 0.3 + 1  # +1 for labels
        # if the number of categories is small (eg 1 or 2) use
        # a larger height
        height = max([1.5, height])
        heatmap_width = len(var_names) * 0.35
        width = heatmap_width + colorbar_width + size_legend_width + dendro_width + colorbar_width_spacer
    else:
        width, height = figsize
        heatmap_width = width - (colorbar_width + size_legend_width + dendro_width + colorbar_width_spacer)

    # colorbar ax width should not change with differences in the width of the image
    # otherwise can become too small

    if var_group_positions is not None and len(var_group_positions) > 0:
        # add some space in case 'brackets' want to be plotted on top of the image
        height_ratios = [0.5, 10]
    else:
        height_ratios = [0, 10.5]

    # define a layout of 2 rows x 5 columns
    # first row is for 'brackets' (if no brackets needed, the height of this row is zero)
    # second row is for main content. This second row
    # is divided into 4 axes:
    #   first ax is for the main figure
    #   second ax is for dendrogram (if present)
    #   third ax is for the color bar legend
    #   fourth ax is for an spacer that avoids the ticks
    #             from the color bar to be hidden beneath the size lengend axis
    #   fifth ax is to plot the size legend
    fig = plt.figure(figsize=(width, height))
    axs = gridspec.GridSpec(nrows=2, ncols=5, wspace=0.02, hspace=0.04,
                            width_ratios=[heatmap_width, dendro_width, colorbar_width, colorbar_width_spacer, size_legend_width],
                            height_ratios=height_ratios)
    if len(categories) < 4:
        # when few categories are shown, the colorbar and size legend
        # need to be larger than the main plot, otherwise they would look
        # compressed. For this, the dotplot ax is split into two:
        axs2 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=axs[1, 0],
                                                height_ratios=[len(categories) * 0.3, 1])
        dot_ax = fig.add_subplot(axs2[0])
    else:
        dot_ax = fig.add_subplot(axs[1, 0])

    color_legend = fig.add_subplot(axs[1, 2])

    # to keep the size_legen of about the same height, irrespective
    # of the number of categories, the fourth ax is subdivided into two parts
    size_legend_height = min(1.3, height)
    # wspace is proportional to the width but a constant value is
    # needed such that the spacing is the same for thinner or wider images.
    wspace = 10.5 / width
    axs3 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=axs[1, 4], wspace=wspace,
                                            height_ratios=[size_legend_height / height,
                                                           (height - size_legend_height) / height])
    # make scatter plot in which
    # x = var_names
    # y = groupby category
    # size = fraction
    # color = mean expression

    y, x = np.indices(mean_obs.shape)
    y = y.flatten()
    x = x.flatten()
    frac = fraction_obs.values.flatten()
    mean_flat = mean_obs.values.flatten()
    cmap = plt.get_cmap(color_map)
    if dot_max is None:
        dot_max = np.ceil(max(frac) * 10) / 10
    else:
        if dot_max < 0 or dot_max > 1:
            raise ValueError("`dot_max` value has to be between 0 and 1")
    if dot_min is None:
        dot_min = 0
    else:
        if dot_min < 0 or dot_min > 1:
            raise ValueError("`dot_min` value has to be between 0 and 1")

    if dot_min != 0 or dot_max != 1:
        # clip frac between dot_min and  dot_max
        frac = np.clip(frac, dot_min, dot_max)
        old_range = dot_max - dot_min
        # re-scale frac between 0 and 1
        frac = ((frac - dot_min) / old_range)

    size = (frac * 10) ** 2
    size += smallest_dot
    import matplotlib.colors

    normalize = matplotlib.colors.Normalize(vmin=kwds.get('vmin'), vmax=kwds.get('vmax'))
    colors = cmap(normalize(mean_flat))
    dot_ax.scatter(x, y, color=colors, s=size, cmap=cmap, norm=None, edgecolor='none', **kwds)
    y_ticks = range(mean_obs.shape[0])
    dot_ax.set_yticks(y_ticks)
    dot_ax.set_yticklabels([mean_obs.index[idx] for idx in y_ticks])

    x_ticks = range(mean_obs.shape[1])
    dot_ax.set_xticks(x_ticks)
    dot_ax.set_xticklabels([mean_obs.columns[idx] for idx in x_ticks], rotation=90)
    dot_ax.tick_params(axis='both', labelsize='small')
    dot_ax.grid(False)
    dot_ax.set_xlim(-0.5, len(var_names) + 0.5)
    dot_ax.set_ylabel(groupby)

    # to be consistent with the heatmap plot, is better to
    # invert the order of the y-axis, such that the first group is on
    # top
    ymin, ymax = dot_ax.get_ylim()
    dot_ax.set_ylim(ymax+0.5, ymin - 0.5)

    dot_ax.set_xlim(-1, len(var_names))

    # plot group legends on top of dot_ax (if given)
    if var_group_positions is not None and len(var_group_positions) > 0:
        gene_groups_ax = fig.add_subplot(axs[0, 0], sharex=dot_ax)
        _plot_gene_groups_brackets(gene_groups_ax, group_positions=var_group_positions,
                                   group_labels=var_group_labels,
                                   rotation=var_group_rotation,
                                   orientation = 'top')

    # plot colorbar
    import matplotlib.colorbar
    matplotlib.colorbar.ColorbarBase(color_legend, cmap=cmap, norm=normalize)

    # for the dot size legend, use step between dot_max and dot_min
    # based on how different they are.
    diff = dot_max - dot_min
    if 0.3 < diff <= 0.6:
        step = 0.1
    elif diff <= 0.3:
        step = 0.05
    else:
        step = 0.2
    # a descending range that is afterwards inverted is used
    # to guarantee that dot_max is in the legend.
    fracs_legends = np.arange(dot_max, dot_min, step * -1)[::-1]
    if dot_min != 0 or dot_max != 1:
        fracs_values = ((fracs_legends - dot_min) / old_range)
    else:
        fracs_values = fracs_legends
    size = (fracs_values * 10) ** 2
    size += smallest_dot
    color = [cmap(normalize(value)) for value in np.repeat(max(mean_flat) * 0.7, len(size))]

    # plot size bar
    size_legend = fig.add_subplot(axs3[0])

    size_legend.scatter(np.repeat(0, len(size)), range(len(size)), s=size, color=color)
    size_legend.set_yticks(range(len(size)))
    labels = ["{:.0%}".format(x) for x in fracs_legends]
    if dot_max < 1:
        labels[-1] = ">" + labels[-1]
    size_legend.set_yticklabels(labels)
    size_legend.set_yticklabels(["{:.0%}".format(x) for x in fracs_legends])

    size_legend.tick_params(axis='y', left=False, labelleft=False, labelright=True)

    # remove x ticks and labels
    size_legend.tick_params(axis='x', bottom=False, labelbottom=False)

    # remove surrounding lines
    size_legend.spines['right'].set_visible(False)
    size_legend.spines['top'].set_visible(False)
    size_legend.spines['left'].set_visible(False)
    size_legend.spines['bottom'].set_visible(False)
    size_legend.grid(False)

    ymin, ymax = size_legend.get_ylim()
    size_legend.set_ylim(ymin, ymax+0.5)

    #utils.savefig_or_show('dotplot', show=show, save=save)
    return axs

Specify read/write directories

In [6]:
# PATH TO H5 WITH PROCESSED DATAFRAMES
DATA_PATH = './data/'#'/ifs/e63data/massaguelab/ashley/data/sc_RNAseq/h5/processed/Project_AL_H2087_LCC/manuscript/'
FN = 'PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED.h5'

h5_data = H5(DATA_PATH + FN)
h5_data.ls()
['/DF_ALL', '/DF_EPITHELIAL_NOR', '/DF_EPITHELIAL_NOR_TUMOR', '/DF_EPITHELIAL_NOR_TUMOR_MET', '/DF_LYMPHOID_ALL', '/DF_MYELOID_OTHER_ALL', '/DIMENSIONS_ALL', '/DIMENSIONS_EPITHELIAL_NOR', '/DIMENSIONS_EPITHELIAL_NOR_TUMOR', '/DIMENSIONS_EPITHELIAL_NOR_TUMOR_MET', '/DIMENSIONS_LYMPHOID_ALL', '/DIMENSIONS_MYELOID_OTHER_ALL', '/GENE_RANK_MAST_ALL', '/GENE_RANK_MAST_EPITHELIAL_NOR', '/GENE_RANK_MAST_EPITHELIAL_NOR_TUMOR', '/GENE_RANK_MAST_EPITHELIAL_NOR_TUMOR_MET', '/GENE_RANK_MAST_LYMPHOID_ALL', '/GENE_RANK_MAST_MYELOID_OTHER_ALL', '/INDF_ALL', '/INDF_EPITHELIAL_NOR', '/INDF_EPITHELIAL_NOR_TUMOR', '/INDF_EPITHELIAL_NOR_TUMOR_MET', '/INDF_LYMPHOID_ALL', '/INDF_MYELOID_OTHER_ALL', '/NDF_ALL', '/NDF_EPITHELIAL_NOR', '/NDF_EPITHELIAL_NOR_TUMOR', '/NDF_EPITHELIAL_NOR_TUMOR_MET', '/NDF_LYMPHOID_ALL', '/NDF_MYELOID_OTHER_ALL']
In [7]:
# SPECIFY SAVE DIRECTORIES
tag = 'demo' # optional key word to append to directory name
now = time.strftime("%x") 
now = str.replace(now,'/','_')

# SPECIFY OUTPUT STEMS FOR FIGURES/PATHWAY ANALYSIS
FIG_output_stem = "./figures/" + FN.replace('.h5','{}_'.format(tag)) + now +'/'
print(FIG_output_stem)

# CREATE DIRECTORIES IF THEY DO NOT EXIST

# CREATE FIGURE DIRECTORY IF IT DOES NOT EXIST     
d = os.path.dirname(FIG_output_stem)
if not os.path.exists(d):
        os.makedirs(d)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/

Define colormaps

In [8]:
# --- DEFINE ALL COLORMAPS UPFRONT FOR UNIFORMITY ---

# CATEGORICAL: SAMPLE
FLATUI_SAMPLE = [ 
                 '800000', #'RU255B_BRAINMET_4S_TR' --> BRAIN MET (TR) FF4040 REDS
                 '551A8B', #'RU653_TUMOR_1AS_UTR' --> PRIMARY 551A8B purple
                 '0000EE', #'RU661_TUMOR_1AS_UTR' -- PRIMARY 0000EE blue
                 'FF00FF', #'RU666_SPINEMET_4NS_TR' --> SPINE MET (TR) 8B0000 REDS
                 'EEEE00', #'RU675_NORMAL_1ANS_UTR' --> NORMAL EEEE00 YELLOWS
                 '4682B4', #'RU675_TUMOR_1ANS_UTR' --> PRIMARY 4682B4 steel blue
                 '63B8FF', #'RU676_TUMOR_1AS_UTR' --> PRIMARY 63B8FF steel blue 1
                 '9400D3', #'RU679_TUMOR_2AS_TR' --> PRIMARY (TR) 9400D3 DARK VIOLET
                 '33A1C9', #'RU680_TUMOR_1AS_UTR' --> PRIMARY 33A1C9 peacock
                 'DC143C', #'RU681_BRAINMET_4S_TR' --> BRAIN MET (TR) CD0000 REDS
                 'CDCD00', #'RU682_NORMAL_2AS_UTR' --> NORMAL CDCD00 YELLOWS
                 '00C5CD', #'RU682_TUMOR_2AS_UTR' --> PRIMARY 00C5CD turquois
                 '8B8B00', #'RU684_NORMAL_1AS_UTR'--> NORMAL 8B8B00 YELLOWS
                 '03A89E', #'RU684_TUMOR_1AS_UTR' --> PRIMARY 03A89E maganese blue
                 '808069', #'RU685_NORMAL_1ANS_UTR' --> NORMAL 808069 YELLOWS
                 'FF1493', # RU699_ADRENAL_MET
                 'FF4500', #RU701_BRAINMET
]

# CATEGORICAL: NORMAL SAMPLES
FLATUI_NORMAL = [ 
                 
                 'EEEE00', #'RU675_NORMAL_1ANS_UTR' --> NORMAL EEEE00 YELLOWS
                 'CDCD00', #'RU682_NORMAL_2AS_UTR' --> NORMAL CDCD00 YELLOWS
                 '8B8B00', #'RU684_NORMAL_1AS_UTR'--> NORMAL 8B8B00 YELLOWS
                 '808069', #'RU685_NORMAL_1ANS_UTR' --> NORMAL 808069 YELLOWS
]

FLATUI_CANCERandNORMAL = [ 
                 '551A8B', #'RU653_TUMOR_1AS_UTR' --> PRIMARY 551A8B purple
                 '0000EE', #'RU661_TUMOR_1AS_UTR' -- PRIMARY 0000EE blue
                 'EEEE00', #'RU675_NORMAL_1ANS_UTR' --> NORMAL EEEE00 YELLOWS
                 '4682B4', #'RU675_TUMOR_1ANS_UTR' --> PRIMARY 4682B4 steel blue
                 '63B8FF', #'RU676_TUMOR_1AS_UTR' --> PRIMARY 63B8FF steel blue 1
                 '9400D3', #'RU679_TUMOR_2AS_TR' --> PRIMARY (TR) 9400D3 DARK VIOLET
                 '33A1C9', #'RU680_TUMOR_1AS_UTR' --> PRIMARY 33A1C9 peacockEDS
                 'CDCD00', #'RU682_NORMAL_2AS_UTR' --> NORMAL CDCD00 YELLOWS
                 '00C5CD', #'RU682_TUMOR_2AS_UTR' --> PRIMARY 00C5CD turquois
                 '8B8B00', #'RU684_NORMAL_1AS_UTR'--> NORMAL 8B8B00 YELLOWS
                 '03A89E', #'RU684_TUMOR_1AS_UTR' --> PRIMARY 03A89E maganese blue
                 '808069', #'RU685_NORMAL_1ANS_UTR' --> NORMAL 808069 YELLOWS
]

FLATUI_SOURCE = [

                 'FF0000', # MET
                 'E6E6FA', # NORMAL
                 '000000', # PRIMARY
]

FLATUI_PATIENT = [
                 'FF4040', # 255B
                 '551A8B', # 653
                 '0000EE', # 661
                 '8B0000', # 666
                 '4682B4', # 675
                 '63B8FF', # 676
                 'CD00CD', # 679
                 '9400D3', # 680
                 '33A1C9', # 681
                 'CD0000', # 682
                 '00C5CD', # 684
                 '03A89E', # 685
                 '808069', # 699
                 '99FFFF', # 701
]

FLATUI_CLASS = [
                 '0000FF', # 0
                 'FF0000', # 1   
                 'FF8C00', # 2
                 'FFD700', # 3  
                 '808000', # 4
                 '9ACD32', # 5   
                 '44F544', # 6
                 '20B2AA', # 7  
                 '2F4F4F', # 8
                 '00BFFF', # 9   
                 '147ADF', # 10
                 '191970', # 11 
                 '6200A4', # 12
                 '8B008B', # 13  
                 'FF00FF', # 14
                 'FF1493', # 15 
                 '8B4513', # 16
                 'D2691E', # 17 
                 'B0C4DE', # 18
                 '696969', # 19
                 'FF586E', # 20
                 'FAEBD7', # 21
                 'E2C792', # 22
                 'FFF7A4', # 23
                 '83715A', # 24
                 'BC8F8F', # 25
                 '4C718E', # 26
                 '519395', # 27
                 '7FFFD4', # 28  
                 '0F830F', # 29
                 'D4FFF0', # 30
                 'B72222', # 31
                 'FDBABA', # 32
                 '1B1010', # 33
                 'CD57FF', # 34
                 'F0FFFF', # 35
        
]

# UPDATE COLORMAP
FLATUI_CELL_TYPE = [
                 '014E4E', # Breg
                 '800080', # DENDRITIC
                 '490049', # DENDRITIC (ACTIVATED)
                 '0000FF', # ENDOTHELIAL
                 'BC8F8F', # EPITHELIAL
                 '00994C', # FIBROBLAST
                 '000000', # IG
                 '000080', # MACROPHAGE
                 'B0C4DE', # MAST
                 'FF00FF', # MDSC
                 '0080FF', # MICROGLIA/MACROPHAGE
                 'D8BFD8', # MONOCYTE
                 '983DFF', # MONOCYTE (ACTIVATED)
                 'FF0000', # NK
                 '800000', # NKT
                 '00CCCC', # PERICYTE
                 '00FF7F', # PROLIFERATING MESENCHYMAL PROGENITOR
                 'FFD700', # Th
                 '994C00', # Tm
                 'FF8000', # Treg
]



# METACELL TYPE TO HEX
FLATUI_METACELL = [
                   '00008B', # LYMPHOID
                   '8B008B', # MYELOID
                   'BC8F8F', # OTHER
                   ]

FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS = [
                 '0000FF', # 0
                 'FF0000', # 1   
                 'FF8C00', # 2
                 'FFD700', # 3  
                 '808000', # 4
                 '9ACD32', # 5   
                 '44F544', # 6
                 '20B2AA', # 7  
                 '2F4F4F', # 8
                 '00BFFF', # 9   
                 '147ADF', # 10
                 '191970', # 11 
                 '6200A4', # 12
                 '8B008B', # 13  
                 'FF00FF', # 14
                 'FF1493', # 15 
                 '8B4513', # 16
                 'B0C4DE', # 17
                 '606060', # 18 
                 '696969', # 19
                 'FF586E', # 20
                 'FAEBD7', # 21
                 'E2C792', # 22
                 'FFF7A4', # 23
                 '83715A', # 24
                 'BC8F8F', # 25
                 '4C718E', # 26
                 '519395', # 27
                 '7FFFD4', # 28  
                 '0F830F', # 29
                 'D4FFF0', # 30
                 'B72222', # 31
                 'FDBABA', # 32
                 '1B1010', # 33
                 'CD57FF', # 34
                 'CCFFFF', # 34 **
                 'CD853F', # 36
                 'FF7F50', # 37
                 'C692FF', #38
                 '433355', # 39
        
]

FLATUI_CELL_TYPE_MYELOID_OTHER_ALL = [
                 '800080', # DENDRITIC (purple)
                 '490049', # DENDRITIC ACTIVATED (purple)
                 '0000FF', # ENDOTHELIAL (RED)
                 'BC8F8F', # EPITHELIAL (tan)
                 '00994C', # FIBROBLAST (dark green
                 '000080', # MACROPHAGE (blue)
                 'B0C4DE', # MAST (purple/gray)
                 'FF00FF', # MDSC (light blue/grey)
                 '0080FF', # MICROGLIA/MACROPHAGE
                 'D8BFD8', # MONOCYTE (magenta))
                 '983DFF', # MONOCYTE ACTIVATED(magenta))
                 '00CCCC', # PERICYTE (bright blue)
                 '00FF7F', # PROLIFERATING MESENCHYMAL PROGENITOR (bright green)
]

FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS = [
                 '0000FF', # 0
                 'FF0000', # 1   
                 'FFD700', # 2
                 '808000', # 3 
                 '006400', # 4
                 '20B2AA', # 5 
                 '00BFFF', # 6   
                 '1E90FF', # 7
                 '4B0082', # 8 
                 '8B008B', # 9 
                 'FF1493', # 10
                 'D2691E', # 11 
                 'B0C4DE', # 12 
                 'FAEBD7', # 14
                 'FFFACD', # 15
                 'D2B48C', # 16
                 '5F9EA0', # 18
                 '8FBC8F', # 19
                 '00FA9A', # 20
                 '66CDAA', # 21
                 '3CB371', # 22  
]

FLATUI_CELL_TYPE_LYMPHOID_ALL = ['014E4E', # Breg 
                                 '000000', # IG
                                 'FF0000', # NK
                                 '800000', # NKT
                                 'FFD700', # Th
                                 '994C00', # Tm
                                 'FF8000', # Treg
                                ]

# UPDATE COLOR MAP TO ASSOCIATE LINEAGE COLOR WITH EACH CLASS
FLATUI_CLASS_NORMAL_LINEAGES = [
                 'FF66FF', # AEC1, CLASS 2 
                 '6600CC', # AEC2, CLASS 0
                 '3399FF', # CILIATED, CLASS 1
                 'FF8800', # CLUB/CLARA, CLASS 3
]

FLATUI_PHENOGRAPH_CLASS_NORMAL = [
                 '6600CC', # AT2, CLASS 0
                 '3399FF', # CILIATED, CLASS 1
                 'FF66FF', # AT1, CLASS 2 
                 'FF8800', # CLUB/CLARA, CLASS 3
]

# DEFINE LINEAGE COLORMAP
FLATUI_LINEAGE = [
                 'FF66FF', # AEC1 pink
                 '6600CC', # AEC2 purple
                 '9ACD32', # AEP/MUCINOUS
                 '3399FF', # CILIATED skye blue
                 'FF8800', # CLUB orange
                 '0000FF', # KRT5+BASAL (MESENCHYMAL/KRAS HIGH),#'0000FF' dark blue
                 'FFD700', # PNEC
                 'E6E6FA', # STEM/PROGENITOR
]

FLATUI_SOURCE_RX = [
                 'FF0000', # MET
                 'E6E6FA', # NORMAL
                 '000000', # PRIMARY
                 'FF8000', # PRIMARY_RX
]

FLATUI_CLASS_ALL_EP = [
                 '0000FF', # 0 BLUE
                 'FF0000', # 1    RED
                 'FF8C00', # 2 ROANGE
                 'FFD700', # 3   YELLOW
                 '808000', # 4 Dark yellow
                 '9ACD32', # 5  Lime Green
                 '006400', # 6  Dark green
                 '20B2AA', # 7  Teal
                 '2F4F4F', # 8 GREY DARK
                 '00BFFF', # 9  CYAN
                 '191970', # 11  DARK PURPLE
                 '4B0082', # 12 LIGHTER PURPLE
                 '8B008B', # 13  PINK-PURPLE
                 'FF00FF', # 14 MAGENTA
                 'FF1493', # 15 CHERRY
                 '8B4513', # 16 BROWN
                 'B0C4DE', # 19 LIGHT BLUEGREY  
                 '696969', # 20
                 'FFB6C1', # 21 #SALMON
                 'FAEBD7', # 22 # TAN
                 'F5DEB3', # 23
                 '00CCCC', # 24 TEAL
                 'BC8F8F', # 26 
]

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_CLASS_ALL_EP),3))
for ind,hexcolor in enumerate(FLATUI_CLASS_ALL_EP):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS_ALL_EP = LinearSegmentedColormap.from_list('FLATUI_CLASS_ALL_EP', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_SAMPLE),3))
for ind,hexcolor in enumerate(FLATUI_SAMPLE):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SAMPLES = LinearSegmentedColormap.from_list('FLATUI_SAMPLE', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_NORMAL)
colors = np.zeros((len(FLATUI_NORMAL),3))
for ind,hexcolor in enumerate(FLATUI_NORMAL):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_NORMAL = LinearSegmentedColormap.from_list('FLATUI_NORMAL', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_CANCER & NORMAL)
colors = np.zeros((len(FLATUI_CANCERandNORMAL),3))
for ind,hexcolor in enumerate(FLATUI_CANCERandNORMAL):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CANCERandNORMAL = LinearSegmentedColormap.from_list('FLATUI_CANCERandNORMAL', colors, N=len(colors))
                                              
    
# CONVERT HEX TO RGB (FLATUI_SOURCE)
colors = np.zeros((len(FLATUI_SOURCE),3))
for ind,hexcolor in enumerate(FLATUI_SOURCE):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SOURCE = LinearSegmentedColormap.from_list('FLATUI_SOURCE', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_PATIENT)
colors = np.zeros((len(FLATUI_PATIENT),3))
for ind,hexcolor in enumerate(FLATUI_PATIENT):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_PATIENT = LinearSegmentedColormap.from_list('FLATUI_PATIENT', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_CLASS),3))
for ind,hexcolor in enumerate(FLATUI_CLASS):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS = LinearSegmentedColormap.from_list('FLATUI_CLASS', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_METACELL)
colors = np.zeros((len(FLATUI_METACELL),3))
for ind,hexcolor in enumerate(FLATUI_METACELL):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_METACELL = LinearSegmentedColormap.from_list('FLATUI_METACELL', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_CELL_TYPE),3))
for ind,hexcolor in enumerate(FLATUI_CELL_TYPE):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CELL_TYPE = LinearSegmentedColormap.from_list('FLATUI_CELL_TYPE', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS),3))
for ind,hexcolor in enumerate(FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS = LinearSegmentedColormap.from_list('FLATUI_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_CELL_TYPE_MYELOID_OTHER_ALL),3))
for ind,hexcolor in enumerate(FLATUI_CELL_TYPE_MYELOID_OTHER_ALL):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CELL_TYPE_MYELOID_OTHER_ALL = LinearSegmentedColormap.from_list('FLATUI_CELL_TYPE_MYELOID_OTHER_ALL', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS),3))
for ind,hexcolor in enumerate(FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_LYMPHOID_ALL_PHENOGRAPH_CLASS = LinearSegmentedColormap.from_list('FLATUI_LYMPHOID_ALL_PHENOGRAPH_CLASS', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_SAMPLE)
colors = np.zeros((len(FLATUI_CELL_TYPE_LYMPHOID_ALL),3))
for ind,hexcolor in enumerate(FLATUI_CELL_TYPE_LYMPHOID_ALL):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CELL_TYPE_LYMPHOID_ALL = LinearSegmentedColormap.from_list('FLATUI_CELL_TYPE_LYMPHOID_ALL', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PHENOGRAPH_CLASS_NORMAL),3))
for ind,hexcolor in enumerate(FLATUI_PHENOGRAPH_CLASS_NORMAL):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_PHENOGRAPH_CLASS_NORMAL = LinearSegmentedColormap.from_list('FLATUI_PHENOGRAPH_CLASS_NORMAL', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_CLASS_NORMAL_LINEAGES),3))
for ind,hexcolor in enumerate(FLATUI_CLASS_NORMAL_LINEAGES):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS_NORMAL_LINEAGES = LinearSegmentedColormap.from_list('FLATUI_CLASS_NORMAL_LINEAGES', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_LINEAGE),3))
for ind,hexcolor in enumerate(FLATUI_LINEAGE):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_LINEAGE = LinearSegmentedColormap.from_list('FLATUI_LINEAGE', colors, N=len(colors))

# CONVERT HEX TO RGB (FLATUI_METACELL)
colors = np.zeros((len(FLATUI_SOURCE_RX),3))
for ind,hexcolor in enumerate(FLATUI_SOURCE_RX):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SOURCE_RX = LinearSegmentedColormap.from_list('FLATUI_SOURCE_RX', colors, N=len(colors))

# CONTINUOUS: DIVERGING
CM_DIVERGING = plt.cm.RdBu_r

# CONTINUOUS: SEQUENTIAL
CM_SEQUENTIAL = cmocean.cm.thermal 

Global cell atlas of normal, primary and metastatic lung adenocarcinoma

Description

We profiled the transcriptomes of 40,505 single cells obtained from 17 freshly resected human tissue samples comprising adjacent non-tumour involved lung (n = 4, hereafter referred to as normal lung), primary lung adenocarcinoma (n = 8; 7 untreated and 1 post neo-adjuvant chemotherapy), as well as brain (n = 3), bone (n = 1), and adrenal (n = 1) lung adenocarcinoma metastases (Fig. 1a). These samples were derived from patients spanning various stages of tumour progression (Extended Data Table 1) without enrichment for a specific cell type, such that the viable tumour sample and its microenvironment were sampled in an unbiased manner. scRNA-seq data from all patients were merged to create a global cell atlas of the normal lung, primary tumours, and metastases. In this section, we walk through cell type annotation within this global atlas.

Load data

Processed data is stored as H5 with the following files, sharing common multi-index

In [9]:
# SPECIFY DATA SUBSET
subset_type = 'ALL'
In [10]:
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Evaluate library parameters

In [11]:
exec('QUERY = DF_{}'.format(subset_type))

plt.figure(figsize = (9,3))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.5, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG10 LIBRARY SIZE
ax = plt.subplot(gs1[0])
bins = np.linspace(np.log10(QUERY.values.sum(axis=1).min()), np.log10(QUERY.values.sum(axis=1).max())*0.95, 100)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    x = np.log10(QUERY.select(lambda x: x[1] in label).values.sum(axis=1)) # log cell size
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog10 Cell Size')
plt.grid(True)
sns.despine()

# (2) PLOT LOG NUM UNIQUE GENES
ax = plt.subplot(gs1[1])
bins = np.linspace(np.log10((np.sum(QUERY > 0,axis=1)).min()), 
                   np.log10((np.sum(QUERY > 0,axis=1)).max())*0.95, 100)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    x = np.log10(np.sum(QUERY.select(lambda x: x[1] in label) > 0,axis=1))
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog10 Unique genes')
plt.grid(True)
sns.despine()

# (3) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
ax = plt.subplot(gs1[2])
data = np.log10(np.sum(QUERY.values > 0,axis=0))
bins = np.linspace(0, data.max()*0.95, 20)
for ind, label in enumerate(np.unique(QUERY.index.get_level_values('Legend'))):
    x = np.log10(np.sum(QUERY.select(lambda x: x[1] in label) > 0,axis=0))
    x[(np.isinf(x)) | (np.isnan(x))] = 0
    plt.hist(x, bins, alpha=0.5, label=label,color = np.divide(tuple(hex(FLATUI_SAMPLE[ind]).rgb),255)) 
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('\nLog. Cells/Gene')
plt.grid(True)
sns.despine()

# Add (abbreviated) legend bottom left
L = plt.legend(loc='upper right',prop={'size':6},bbox_to_anchor=(2, 0.95),fancybox=True) 

# SAVE FIGURE
figure_label = '_library_distribution_subset_{}_log10'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label + '.png'
plt.savefig(fn, dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_library_distribution_subset_ALL_log10.png

Meta-cell type annotation

In [12]:
# LOAD CANONICAL CELL TYPE MARKERS FROM EXCEL FILE 
# THIS CORRESPONDS TO SUPPLEMENTARY_TABLE2/TAB1 (Gene Signatures from ED Fig. 1i)
filename = 'CELL_TYPES_WANSLEEBEN_HOGAN_2013'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
print(shape(genesets)[1])
63
In [13]:
# ROW INDEX
meta = 'META_CELL_TYPE'
FLATUI_PLOT = FLATUI_METACELL
title = os.path.basename(path_to_genesets)[:-4]

# GENERATE MATRIX OF CUMULATIVE EXPRESION PER CELL TYPE
exec('QUERY = INDF_{}'.format(subset_type))
titles = list(genesets.columns)#['EPITHELIAL','MESENCHYAL','CD45']
tmp = pd.DataFrame(index = QUERY.index)
for title in titles:
    genes = genesets.values[:,np.where(genesets.columns==title)[0][0]] # genes x pathways
    genes = [x for x in genes if str(x) != 'nan']
    detected_genes = list(set(genes).intersection(set(QUERY.columns)))
    vals = QUERY[detected_genes]
    exec('tmp[\'{}_SCORE\'] = (np.nansum(vals,axis=1))'.format(title))
genes = list(tmp.columns)

# ROW COLORS
ind = QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind).map(lut)

# CONSTRUCT HEATMAP DATA
heatmap_data = pd.DataFrame(data = zscore(tmp.values,axis=0),columns = genes)
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
method = 'average' # average, single
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(18,10))

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.088,0.1,0.01,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
    pos = (x, y / len(r1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.01,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().set_ticks([])

# SAVE FIGURE
figure_label = 'heatmaps/user_defined/_{}_{}_HEATMAP_{}_{}_{}'.format(meta,subset_type,title,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
d = os.path.dirname(fn)
if not os.path.exists(d):
    os.makedirs(d)
    
fig.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDheatmaps/user_defined/_META_CELL_TYPE_ALL_HEATMAP_EARLY_MESODERM_SECRETED_average_cosine
In [14]:
# SPECIFY SUBSET
subset_type = 'ALL'
dimtype = 'IPC_TSNE' # TSNE COMPUTED ON PRINCIPAL COMPONENTS ON THE INDF
dot_size = 1
exec('DIM = DIMENSIONS_{}'.format(subset_type))

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes. 
dot_size = 1

x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# SAMPLE ID: LEGEND
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('Legend'), 
                              cmap=CM_SAMPLES,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                              s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)

# METACELL TYPE ASSIGNMENT: LYMPHOID/MYELOID/OTHER(STROMA/EPITHELIUM)
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('META_CELL_TYPE'), 
                              cmap=CM_METACELL,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                              s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('META_CELL_TYPE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)

# SOURCE: NORMAL/PRIMARY TUMOR/ETASTASIS
ax = plt.subplot(gs1[2])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('Meta-Source'), 
                              cmap= CM_SOURCE,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('SOURCE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)

# SAVE FIGURE
figure_label = '_{}_META_CELL_TYPE_AND_SOURCE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_META_CELL_TYPE_AND_SOURCE_IPC_TSNE
In [15]:
h5_data.ls()
['/DF_ALL', '/DF_EPITHELIAL_NOR', '/DF_EPITHELIAL_NOR_TUMOR', '/DF_EPITHELIAL_NOR_TUMOR_MET', '/DF_LYMPHOID_ALL', '/DF_MYELOID_OTHER_ALL', '/DIMENSIONS_ALL', '/DIMENSIONS_EPITHELIAL_NOR', '/DIMENSIONS_EPITHELIAL_NOR_TUMOR', '/DIMENSIONS_EPITHELIAL_NOR_TUMOR_MET', '/DIMENSIONS_LYMPHOID_ALL', '/DIMENSIONS_MYELOID_OTHER_ALL', '/GENE_RANK_MAST_ALL', '/GENE_RANK_MAST_EPITHELIAL_NOR', '/GENE_RANK_MAST_EPITHELIAL_NOR_TUMOR', '/GENE_RANK_MAST_EPITHELIAL_NOR_TUMOR_MET', '/GENE_RANK_MAST_LYMPHOID_ALL', '/GENE_RANK_MAST_MYELOID_OTHER_ALL', '/INDF_ALL', '/INDF_EPITHELIAL_NOR', '/INDF_EPITHELIAL_NOR_TUMOR', '/INDF_EPITHELIAL_NOR_TUMOR_MET', '/INDF_LYMPHOID_ALL', '/INDF_MYELOID_OTHER_ALL', '/NDF_ALL', '/NDF_EPITHELIAL_NOR', '/NDF_EPITHELIAL_NOR_TUMOR', '/NDF_EPITHELIAL_NOR_TUMOR_MET', '/NDF_LYMPHOID_ALL', '/NDF_MYELOID_OTHER_ALL']
In [16]:
#DF_EPITHELIAL_NOR_TUMOR_MET.shape

Phenograph clustering and cell type annotation within meta-cell types

Myeloid/Epithelium/Stroma

In [17]:
# SPECIFY DATA SUBSET
subset_type = 'MYELOID_OTHER_ALL'

Load data subset

In [18]:
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize Phenograph clusters and cell type assignments

In [19]:
dimtype = 'IPC_TSNE' # TSNE COMPUTED ON PRINCIPAL COMPONENTS ON THE INDF
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes. 
dot_size = 2

xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'), 
                              cmap=CM_SAMPLES,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()

# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
numeric_labels = [int(label[:-14]) for label in QUERY.index.get_level_values('PHENOGRAPH_CLASS')]
g = seqc.plot.scatter.categorical(x, y, c=numeric_labels, 
                                  cmap= CM_MYELOID_OTHER_ALL_PHENOGRAPH_CLASS, 
                                  legend_kwargs={'ncol': 3, 'borderaxespad':0.,
                                            'bbox_to_anchor':(0.75, 0)},
                                  s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')

# CELL TYPE ASSIGNMENTS
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'), 
                                  cmap= CM_CELL_TYPE_MYELOID_OTHER_ALL, 
                                  legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                                  s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')

# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_LEGEND_PHENOGRAPH_CELLTYPE_IPC_TSNE

Query marker expression by cell type (dot plot of un-imputed data)

In [20]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL

# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
     
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
              'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
              'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST']

marker_genes_dict = {
                     'EPITHELIAL': ['CDH1','SOX9','SOX2'],
                     'STROMA': ['VIM', 'ACTA2','DES','PDGFRA','PDGFRB','CDH5','PECAM1','CD34'],
                     'MONOCYTE': ['FCGR3A','ITGAX','CD14'],
                     'DENDRITIC':['CD40','CD80','HLA-DOA', 'HLA-DOB','HLA-DQA1','HLA-DQA2','HLA-DQB1'],
                     'MHC2': ['HLA-DPA1','HLA-DPB1', 'HLA-DQA1','HLA-DQA2','HLA-DQB1', 'HLA-DRA','HLA-DRB1'],
                     'MACROPHAGE': ['LAMP2','LGALS3','MRC1','TFRC','FCGR2A','ITGAM','CD163'],
                     'NEUTROPHIL': ['LY6G6D','S100A9','S100A8','S100A12','PILRA','LILRB2','ISG15'],
                     'MDSC':['CYBB','CSF1R','STAT1','CD14','FCGR1A','FCGR1B','CD33','CCR1'],
                     'MAST':['KIT'] #,'ENPP3'
                    }
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['EPITHELIAL','STROMA','MONOCYTE','NEUTROPHIL','MDSC','MACROPHAGE','DENDRITIC','MAST']

# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
    genes = marker_genes_dict[gene_group]
    genes = [gene for gene in genes if gene in QUERY.columns]
    labels = [gene_group] * len(genes)
    current = pd.DataFrame({'labels':labels,'genes':genes})
    tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values

var_group_positions = []
for gene_group in var_group_labels:
    cpositions = list(tmp[tmp['labels'] ==gene_group].index)
    var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))

# DOT PLOT OF GENES LABELED BY CATEGORY
ax = dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
            expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
            dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
            var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
            var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
            save=None)

# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_dotPlot_markerGenes_byCELL_TYPE

Query marker expression by cell type (matrix plot of imputed data)

In [21]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE' 
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL 

# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))

# CONVERT HEX TO RGB
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)

# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS

yticks = heatmap_data.index
xticks = heatmap_data.columns

# COLOR ROWS BY <meta>
ind = list(heatmap_data.index)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# LINKAGE 
method = 'average' 
metric = 'correlation' 
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(12,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6])
x = 0
y = 0
for c in row_colors[list(mat.index)]:
    pos = (x, y / len(mat.index))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) 
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)

# SAVE FIGURE
figure_label = '_{}_matrixPlotMedian_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_matrixPlotMedian_markerGenes_byCELL_TYPE

Query marker expression by cell type (stacked violin plots of imputed data)

In [22]:
meta='CELL_TYPE' 
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL 

exec('QUERY = INDF_{}'.format(subset_type))

plot_order = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
              'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
              'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST']

genes = marker_genes

# GENERATE COLORMAP FOR ROWS (COLORED BY <meta>)
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})

# (0,0)  Plot violin plot
N = len(plot_order)

# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (8,N*1))
sns.set_style("white")

for iii,cell_type in enumerate(plot_order):
    ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
    g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type], 
                       inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1, 
                       scale = 'count',color = palette[cell_type],ax = ax1)
    g.set_ylabel(" ")
    g.set_xlabel(" ")
    g.tick_params(axis='y',labelsize=8, rotation = 0)
    plt.ylim(0,10)
    plt.title("{}".format(cell_type.title()),fontsize=8,fontname = 'Arial',loc = 'left',
              rotation = 'horizontal', ha = 'left', pad = 1.5)

    ax1.spines['top'].set_linewidth(0.5)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_linewidth(0.5)
    ax1.spines['bottom'].set_linewidth(0.5)
    ax1.spines['left'].set_linewidth(0.5)
    ax1.spines['right'].set_visible(False)
    
    if iii != (N-1):
        ax1.tick_params(axis='x', bottom=False, labelbottom=False)
    else:
        ax1.tick_params(axis='x', labelsize=8, rotation = 90)
        
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_stackedViolin_markerGenes_byCELL_TYPE

Query marker expression by cell type (stacked ridge plot of imputed data)

In [23]:
gene_name = 'CDH1'

exec('QUERY = INDF_{}'.format(subset_type))

plot_order = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
              'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
              'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST']

# GENERATE COLORMAP
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})


gene_data = violin_data[violin_data['gene']==gene_name]
row_order = list(gene_data.groupby(meta).median().sort_values(by=['Expression']).index)

sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(gene_data, row=meta, 
                  hue = meta,aspect=10, size = 0.30,row_order = row_order,
                  palette=palette,col_wrap=None, sharex=True, sharey=True)

# Draw the densities in a few steps
g.map(sns.kdeplot, "Expression", clip_on=False, shade=True, alpha=1, lw=1, bw=.2, kernel = 'gau')
g.map(sns.kdeplot, "Expression", clip_on=False, color="w", lw=1, bw=.2, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    ax = plt.gca()
    ax.text(0, 0.1, label,color=color,fontsize=8,
            ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Expression")

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.75)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

plt.xlabel('KDE({} expression)'.format(gene_name.title()),fontsize=10)

# SAVE FIGURE
figure_label = '_{}_stackedKDE_{}_by{}'.format(subset_type,gene_name,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_stackedKDE_CDH1_byCELL_TYPE

Correlation of aggregate single cell data with bulk gene expression profiles

In [24]:
# DMAP (ALREADY LOG TRANSFORMED)
dmap = pd.read_table(os.path.expanduser(DATA_PATH+'DMap_data.gct'), 
                     skiprows=2, parse_dates=False, index_col=[0, 1]).T # cells x genes
dmap.columns = dmap.columns.droplevel(0)
dmap = dmap.groupby(level=0, axis=1).mean() # average over duplicate genes

# average over dmap replicates
dmap.index = [re.sub('_\d{1,2}', '', i) for i in dmap.index]
dmap = dmap.groupby(level=0, axis=0).mean()

# z-normalize
dmap = pd.DataFrame(data = zscore(dmap,axis=0), columns = dmap.columns, index = dmap.index)

# GARVAN (NOT YET LOG TRANSFORMED)
garvan = pd.read_csv(os.path.expanduser(DATA_PATH+'garvan_immune_nonorm.csv'), 
                     index_col=0)

# average over garvan replicates
garvan.columns = [re.sub('_\d', '', i) for i in garvan.columns]
garvan = garvan.groupby(level=0, axis=1).mean()# genes x cell types

# transpose and log transform data
garvan = np.log2(garvan + 1).T # cell types x genes

# z-normalize
garvan = pd.DataFrame(data = zscore(garvan,axis=0), columns = garvan.columns, index = garvan.index)


# Drop irrelevant cell types for myeloid/other compartment
dmap = dmap.drop(['BASO1', 'EOS2', 'ERY1', 'ERY2', 'ERY3', 'ERY4', 'ERY5', 'GMP', 'CMP',
                  'GRAN1', 'GRAN2', 'GRAN3', 'HSC1', 'HSC3', 'MEGA1', 'MEGA2', 'MEP',
                  'NKA1','NKA2','NKA3','NKA4',
                  'TCELLA2','TCELLA3','TCELLA4','TCELLA6','TCELLA7','TCELLA8','TCELLA1',
                  'PRE_BCELL2','PRE_BCELL3','BCELLA1','BCELLA2','BCELLA3','BCELLA4'])#
garvan = garvan.drop(['NK','bcells','central_memory_T_cells','effector_memory_T_cells',
                      'eosinophils_PMAh','eosinophils_control_rep2','th1_cells','basophils','eosinophils',
                      'th2_cells'])
In [25]:
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL WITH HIGH DISPERSION
meta='CELL_TYPE' 
FLATUI_PLOT = FLATUI_CELL_TYPE_MYELOID_OTHER_ALL 
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 1,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 5)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>biological_variance.mean()

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
retain_goi = list(retain_bool[retain_bool].index)
print(len(retain_goi))

# IDENTIFY VARIABLE GENES ALSO DETECTED IN BULK MRNA DATA
expressed = list(set(retain_goi).intersection(dmap.columns).intersection(garvan.columns))
print(len(expressed))

# SELECT INTERSECTING SINGLE CELL VARIABLE/EXPRESSED GENES IN BULK DATASETS
exec('QUERY = np.log2(INDF_{}+1)'.format(subset_type))# LOG TRANSFORM FOR COMPARISON TO LOG TRANSFORMED BULK DATA
centroid_data = pd.DataFrame(data = zscore(QUERY[expressed].values,axis=0),columns = expressed, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
bulk_sigs = dmap[expressed].append(garvan[expressed])

# COLORMAP LUT BASED ON QUERY & MET
ind = QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))

# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
5424
2117
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_GENE_FILTERING
In [26]:
### CORRELATION BETWEEN CENTROIDS ####
rvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
pvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])

for centroid_ind, centroid_ind_name in enumerate(centroid_data.index):
    for bulk_ind,val in enumerate(bulk_sigs.index):
        m = centroid_data.ix[centroid_ind]
        h = bulk_sigs.ix[val]
        # Linear Regression
        lr = linregress(m, h)
        slope = lr[0]
        intercept = lr[1]
        rvalue = lr[2]
        pvalue = lr[3]
        stderr = lr[4]
        rvalues[centroid_ind,bulk_ind] = rvalue
        pvalues[centroid_ind,bulk_ind] = pvalue

# Generate Dataframe
rvalues = pd.DataFrame(
    data=rvalues,
    columns=list(bulk_sigs.index),
    index=centroid_data.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'

pvalues = pd.DataFrame(
    data=pvalues,
    columns=list(bulk_sigs.index),
    index=centroid_data.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'

# ROW COLORS
ind = list(rvalues.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)

# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average'
metric = 'euclidean'
linkage = hc.linkage(rvalues, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(rvalues.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = rvalues.ix[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(3,round(rvalues.shape[0]/3)))

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.125,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
    pos = (x, y / len(r1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

minplot = 0.10

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.16,0.1,0.7,0.6])
im = axmatrix.matshow(mat[(pvalues < 0.01)*(np.abs(mat)>minplot)], aspect='auto', 
                      origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
plt.setp(axmatrix.spines.values(), linewidth=0.5)

# ADD COLORBAR
axcolor = fig.add_axes([0.16,0.71,0.7,0.03])
cbar = plt.colorbar(im, cax=axcolor, orientation = 'horizontal')
cbar.ax.xaxis.set_ticks_position('top')
cbar.outline.set_visible(False)

# ADD DENDROGRAM
matplotlib.rcParams['lines.linewidth'] = 0.5
sch.set_link_color_palette(['#000000', '#000000', '#000000', '#000000','#000000','#000000','#000000'])
ax2 = fig.add_axes([0.045,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#000000')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '_{}_CORRELATION_{}_DBGAPandGARVAN_{}_{}'.format(subset_type,meta,method,metric) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_MYELOID_OTHER_ALL_CORRELATION_CELL_TYPE_DBGAPandGARVAN_average_euclidean

Lymphoid

In [27]:
# SPECIFY DATA SUBSET
subset_type = 'LYMPHOID_ALL'

Load data subset

In [28]:
# LOAD DATA

# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize Phenograph clusters and cell type assignments

In [29]:
subset_type = 'LYMPHOID_ALL'
dimtype = 'RLS_IPC_TSNE' # TSNE COMPUTED ON PRINCIPAL COMPONENTS(PCS) ON THE INDF AFTER REGRESSION OF LIBRARY SIZE FROM PCS
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))

x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes. 
dot_size = 2

xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'), 
                              cmap=CM_SAMPLES,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()

# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
numeric_labels = [int(label[:-9]) for label in QUERY.index.get_level_values('PHENOGRAPH_CLASS')]
g = seqc.plot.scatter.categorical(x, y, c=numeric_labels, 
                                  cmap= CM_LYMPHOID_ALL_PHENOGRAPH_CLASS, 
                                  legend_kwargs={'ncol': 3, 'borderaxespad':0.,
                                            'bbox_to_anchor':(0.75, 0)},
                                  s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')

# CELL TYPE ASSIGNMENTS
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'), 
                                  cmap= CM_CELL_TYPE_LYMPHOID_ALL, 
                                  legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                                  s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')

# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_LEGEND_PHENOGRAPH_CELLTYPE_RLS_IPC_TSNE

Query marker expression by cell type (dot plot of un-imputed data)

In [30]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL

# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
     
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['Breg', 'IG', 'NK', 'NKT', 'Treg','Th', 'Tm']

marker_genes_dict = {
                     'B_CELL': ['CD79A','MS4A1'],
                     'PLASMA_CELL': ['IGHM','IGHA1','IGHG1','IGHG2','IGHG3','IGHG4'],
                     'T_CELL': ['CD2','CD3D','CD3E','CD3G','CD8A','CD8B','STAT4',
                               'CTLA4','IFNG','IL7R','CD40LG'],
                     'NK_CELL':['NKG7','GNLY','NCR3','CD16','KIR33DL1','KLRD1']

                    }
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['B_CELL','PLASMA_CELL','NK_CELL','T_CELL']

# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
    genes = marker_genes_dict[gene_group]
    genes = [gene for gene in genes if gene in QUERY.columns]
    labels = [gene_group] * len(genes)
    current = pd.DataFrame({'labels':labels,'genes':genes})
    tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values

var_group_positions = []
for gene_group in var_group_labels:
    cpositions = list(tmp[tmp['labels'] ==gene_group].index)
    var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))

# DOT PLOT OF GENES LABELED BY CATEGORY
ax = dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
            expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
            dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
            var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
            var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
            save=None)

# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_dotPlot_markerGenes_byCELL_TYPE

Query marker expression by cell type (matrix plot of imputed data)

In [31]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta= 'CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL 

# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))

# CONVERT HEX TO RGB
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)

# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
if meta=='PHENOGRAPH_CLASS':
    new_ind = [int(name[:-9]) for name in heatmap_data.index]
    heatmap_data = pd.DataFrame(data = heatmap_data.values, index = new_ind, columns = heatmap_data.columns)

yticks = heatmap_data.index
xticks = heatmap_data.columns

# Palatte for Class METADATA
ind = list(heatmap_data.index) # CLUSTERS
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# LINKAGE 
method = 'average' 
metric = 'correlation'
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(10,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[list(mat.index)]:
    pos = (x, y / len(mat.index))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)

# SAVE FIGURE
figure_label = '_{}_matrixPlotMedian_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_matrixPlotMedian_markerGenes_byCELL_TYPE

Query marker expression by cell type (stacked violin plots of imputed data)

In [32]:
meta='CELL_TYPE' 
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL 

exec('QUERY = INDF_{}'.format(subset_type))

plot_order = ['Breg', 'IG', 'NK', 'NKT', 'Treg','Th', 'Tm']
genes = marker_genes

# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})

# (0,0)  Plot violin plot
N = len(plot_order)

# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (3,N*1))
sns.set_style("white")

for iii,cell_type in enumerate(plot_order):
    ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
    g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type], 
                       inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1, 
                       scale = 'count',color = palette[cell_type],ax = ax1)
    g.set_ylabel(" ")
    g.set_xlabel(" ")
    g.tick_params(axis='y',labelsize=8, rotation = 0)
    plt.ylim(0,10)
    plt.title("{}".format(cell_type.title()),fontsize=8,fontname = 'Arial',loc = 'left',
              rotation = 'horizontal', ha = 'left', pad = 1.5)

    ax1.spines['top'].set_linewidth(0.5)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_linewidth(0.5)
    ax1.spines['bottom'].set_linewidth(0.5)
    ax1.spines['left'].set_linewidth(0.5)
    ax1.spines['right'].set_visible(False)
    
    if iii != (N-1):
        ax1.tick_params(axis='x', bottom=False, labelbottom=False)
    else:
        ax1.tick_params(axis='x', labelsize=8, rotation = 90)
        
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_stackedViolin_markerGenes_byCELL_TYPE

Query marker expression by cell type (stacked ridge plot of imputed data)

In [33]:
gene_name = 'NKG7'
exec('QUERY = INDF_{}'.format(subset_type))

meta='CELL_TYPE' 
FLATUI_PLOT = FLATUI_CELL_TYPE_LYMPHOID_ALL 

# COLOR ROWS BY <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})


gene_data = violin_data[violin_data['gene']==gene_name]
row_order = list(gene_data.groupby(meta).median().sort_values(by=['Expression']).index)

sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(gene_data, row=meta, 
                  hue = meta,aspect=10, size = 0.30,row_order = row_order,
                  palette=palette,col_wrap=None, sharex=True, sharey=True)

# Draw the densities in a few steps
g.map(sns.kdeplot, "Expression", clip_on=False, shade=True, alpha=1, lw=1, bw=.2, kernel = 'gau')
g.map(sns.kdeplot, "Expression", clip_on=False, color="w", lw=1, bw=.2, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    ax = plt.gca()
    ax.text(0, 0.1, label,color=color,fontsize=8,
            ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Expression")

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.75)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

plt.xlabel('KDE({} expression)'.format(gene_name.title()),fontsize=10)

# SAVE FIGURE
figure_label = '_{}_stackedKDE_{}_by{}'.format(subset_type,gene_name,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_stackedKDE_NKG7_byCELL_TYPE

Correlation of aggregate single cell data with bulk gene expression profiles

In [34]:
#### LOAD BULK MRNA DATA ####

# DMAP (ALREADY LOG TRANSFORMED)
dmap = pd.read_table(os.path.expanduser(DATA_PATH+'DMap_data.gct'), 
                     skiprows=2, parse_dates=False, index_col=[0, 1]).T # cells x genes
dmap.columns = dmap.columns.droplevel(0)
dmap = dmap.groupby(level=0, axis=1).mean() # average over duplicate genes

# average over dmap replicates
dmap.index = [re.sub('_\d{1,2}', '', i) for i in dmap.index]
dmap = dmap.groupby(level=0, axis=0).mean()

# z-normalize
dmap = pd.DataFrame(data = zscore(dmap,axis=0), columns = dmap.columns, index = dmap.index)

# GARVAN (NOT YET LOG TRANSFORMED)
garvan = pd.read_csv(os.path.expanduser(DATA_PATH+'garvan_immune_nonorm.csv'), 
                     index_col=0)

# average over garvan replicates
garvan.columns = [re.sub('_\d', '', i) for i in garvan.columns]
garvan = garvan.groupby(level=0, axis=1).mean()# genes x cell types

# transpose and log transform data
garvan = np.log2(garvan + 1).T # cell types x genes

# z-normalize
garvan = pd.DataFrame(data = zscore(garvan,axis=0), columns = garvan.columns, index = garvan.index)


# Drop irrelevant cell types for myeloid/other compartment
dmap = dmap.drop(['BASO1', 'EOS2', 'ERY1', 'ERY2', 'ERY3', 
                  'ERY4', 'ERY5', 'GMP', 'GRAN1', 'GRAN2', 
                  'GRAN3', 'HSC1', 'HSC3', 'MEGA1', 'MEGA2', 
                  'MEP', 'PRE_BCELL2', 'PRE_BCELL2', 
                  'PRE_BCELL3','CMP','DENDA1','DENDA2',
                 'GMP','HSC1','HSC3','MONO1','MONO2'], axis=0)
garvan = garvan.drop(['basophils','dc', 'dc_LPS8h','eosinophils', 'eosinophils_PMAh',
       'eosinophils_control_rep2', 'macrophages', 'macrophages_LPSh',
       'mast_cells', 'mast_cells_IgEh', 'neutrophils', 'neutrophils_LPSh'], axis=0)
In [35]:
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
meta= 'CELL_TYPE'
FLATUI_PLOT =FLATUI_CELL_TYPE_LYMPHOID_ALL
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 1,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 5)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
retain_goi = list(retain_bool[retain_bool].index)
print(len(retain_goi))

# IDENTIFY VARIABLE GENES ALSO DETECTED IN BULK MRNA DATA
expressed = list(set(retain_goi).intersection(dmap.columns).intersection(garvan.columns))
print(len(expressed))

# SELECT INTERSECTING SINGLE CELL VARIABLE/EXPRESSED GENES IN BULK DATASETS
exec('QUERY = np.log2(INDF_{}+1)'.format(subset_type))# LOG TRANSFORM FOR COMPARISON TO LOG TRANSFORMED BULK DATA
centroid_data = pd.DataFrame(data = zscore(QUERY[expressed].values,axis=0),columns = expressed, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
bulk_sigs = dmap[expressed].append(garvan[expressed])

# COLORMAP LUT BASED ON QUERY & MET
ind = centroid_data.index
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))

# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
6565
3362
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_GENE_FILTERING
In [36]:
### CORRELATION BETWEEN CENTROIDS ####
rvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
pvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])

for centroid_ind, centroid_ind_name in enumerate(centroid_data.index):
    for bulk_ind,val in enumerate(bulk_sigs.index):
        m = centroid_data.ix[centroid_ind]
        h = bulk_sigs.ix[val]
        # Linear Regression
        lr = linregress(m, h)
        slope = lr[0]
        intercept = lr[1]
        rvalue = lr[2]
        pvalue = lr[3]
        stderr = lr[4]
        rvalues[centroid_ind,bulk_ind] = rvalue
        pvalues[centroid_ind,bulk_ind] = pvalue

# Generate Dataframe
rvalues = pd.DataFrame(
    data=rvalues,
    columns=list(bulk_sigs.index),
    index=centroid_data.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'

pvalues = pd.DataFrame(
    data=pvalues,
    columns=list(bulk_sigs.index),
    index=centroid_data.index)

pvalues.columns.name = 'Patient Single Cell Lung Epithelium'

# ROW COLORS
ind = list(rvalues.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)

# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average'
metric = 'euclidean'
linkage = hc.linkage(rvalues, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(rvalues.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = rvalues.ix[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(5,round(rvalues.shape[0]/3)))

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.125,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
    pos = (x, y / len(r1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.16,0.1,0.7,0.6])
im = axmatrix.matshow(mat[pvalues < 0.01], aspect='auto', 
                      origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
plt.setp(axmatrix.spines.values(), linewidth=0.5)

# ADD COLORBAR
axcolor = fig.add_axes([0.16,0.71,0.7,0.03])
cbar = plt.colorbar(im, cax=axcolor, orientation = 'horizontal')
cbar.ax.xaxis.set_ticks_position('top')
cbar.outline.set_visible(False)

# ADD DENDROGRAM
matplotlib.rcParams['lines.linewidth'] = 0.5
sch.set_link_color_palette(['#000000', '#000000', '#000000', '#000000','#000000','#000000','#000000'])
ax2 = fig.add_axes([0.045,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#000000')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '_{}_CORRELATION_{}_DBGAPandGARVAN_{}_{}'.format(subset_type,meta,method,metric) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_LYMPHOID_ALL_CORRELATION_CELL_TYPE_DBGAPandGARVAN_average_euclidean

Annotated cell types mapped back to merged dataset

Cell type assignments were mapped directly back to the global dataset, using clusters annotated in MYELOID/OTHER and LYMPHOID analysis above. The index, 'PHENOGRAPH_CLASS' now also includes the phenograph class ID generated within each of the respective data subsets.

GENE_RANK_MAST for the 'ALL' subset was run for each cell type vs. all other cells in this complete merged atlas.

Visualize cell type assignments

In [37]:
# SPECIFY SUBSET
subset_type = 'ALL'
dimtype = 'IPC_TSNE' 
dot_size = 1
exec('DIM = DIMENSIONS_{}'.format(subset_type))

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes. 
dot_size = 1

x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]
xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# SAMPLE ID
ax = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('Legend'), 
                              cmap=CM_SAMPLES,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                              s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)

# METACELL TYPE ASSIGNMENT
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('META_CELL_TYPE'), 
                              cmap=CM_METACELL,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                              s=dot_size, ax=ax);
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('META_CELL_TYPE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)

# SOURCE: NORMAL VS. PRIMARY TUMOR VS. METASTASIS
ax = plt.subplot(gs1[2])
seqc.plot.scatter.categorical(x, y, c=DF_ALL.index.get_level_values('CELL_TYPE'), 
                              cmap= CM_CELL_TYPE,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax,randomize=True);
plt.rc('xtick', labelsize=6)
plt.rc('ytick', labelsize=6)
sns.despine()
plt.xlabel('tSNE-1', fontname='Helvetica', size=10, weight='normal')
plt.ylabel('tSNE-2', fontname='Helvetica', size=10, weight='normal')
plt.title('SOURCE', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
ax.set_facecolor('white')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)

figure_label = '_{}_LEGEND_METACELLTYPE_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_LEGEND_METACELLTYPE_CELLTYPE_IPC_TSNE

Query marker expression by cell type (dot plot of un-imputed data)

In [38]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE

# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
     
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
              'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE',
              'MICROGLIA/MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)','MAST',
              'Breg', 'IG', 'NK', 'NKT', 'Treg','Th', 'Tm']

marker_genes_dict = {
                     'EPITHELIAL': ['CDH1','SOX9','SOX2'],
                     'STROMA': ['VIM', 'ACTA2','DES','PDGFRA','PDGFRB','CDH5','PECAM1','CD34'],
                     'MONOCYTE': ['FCGR3A','ITGAX','CD14'],
                     'DENDRITIC':['CD40','CD80','HLA-DOA', 'HLA-DOB','HLA-DQA1','HLA-DQA2','HLA-DQB1'],
                     'MHC2': ['HLA-DPA1','HLA-DPB1', 'HLA-DQA1','HLA-DQA2','HLA-DQB1', 'HLA-DRA','HLA-DRB1'],
                     'MACROPHAGE': ['LAMP2','LGALS3','MRC1','TFRC','FCGR2A','ITGAM','CD163'],
                     'NEUTROPHIL': ['LY6G6D','S100A9','S100A8','S100A12','PILRA','LILRB2','ISG15'],
                     'MDSC':['CYBB','CSF1R','STAT1','CD14','FCGR1A','FCGR1B','CD33','CCR1'],
                     'MAST':['KIT'], 
                     'B_CELL': ['CD79A','MS4A1'],
                     'PLASMA_CELL': ['IGHM','IGHA1','IGHG1','IGHG2','IGHG3','IGHG4'],
                     'T_CELL': ['CD2','CD3D','CD3E','CD3G','CD8A','CD8B','STAT4',
                               'CTLA4','IFNG','IL7R','CD40LG'],
                     'NK_CELL':['NKG7','GNLY','NCR3','CD16','KIR33DL1','KLRD1'],
                     'IMMUNE':['PTPRC']
                    }
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['EPITHELIAL','STROMA','MONOCYTE','NEUTROPHIL','MDSC','MACROPHAGE','DENDRITIC','MAST','B_CELL','PLASMA_CELL','NK_CELL','T_CELL','IMMUNE']

# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
    genes = marker_genes_dict[gene_group]
    genes = [gene for gene in genes if gene in QUERY.columns]
    labels = [gene_group] * len(genes)
    current = pd.DataFrame({'labels':labels,'genes':genes})
    tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values

var_group_positions = []
for gene_group in var_group_labels:
    cpositions = list(tmp[tmp['labels'] ==gene_group].index)
    var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))

# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
        expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
        dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
        var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
        var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
        save=None)

# COLORMAP LUT BASED ON QUERY & MET
ind = categories #QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_dotPlot_markerGenes_byCELL_TYPE
<Figure size 360x720 with 0 Axes>

Query marker expression by cell type (matrix plot of imputed data)

In [39]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE' 
FLATUI_PLOT = FLATUI_CELL_TYPE

# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))

# CONVERT HEX TO RGB
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)

# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS

yticks = heatmap_data.index
xticks = heatmap_data.columns

# Palatte for Class METADATA
ind = list(heatmap_data.index) # CLUSTERS
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# LINKAGE 
method = 'average' # average, single
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(15,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[list(mat.index)]:
    pos = (x, y / len(mat.index))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)

# SAVE FIGURE
figure_label = '_{}_matrixPlo`btMedian_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_matrixPlo`btMedian_markerGenes_byCELL_TYPE
In [40]:
# SAVE GENE ORDER FOR STACKED VIOLIN PLOT
global_marker_gene_order = mat.columns

Correlation of aggregate single cell data with bulk gene expression profiles

In [41]:
#### LOAD BULK MRNA DATA ####

# DMAP (ALREADY LOG TRANSFORMED)
dmap = pd.read_table(os.path.expanduser(DATA_PATH+'DMap_data.gct'), 
                     skiprows=2, parse_dates=False, index_col=[0, 1]).T # cells x genes
dmap.columns = dmap.columns.droplevel(0)
dmap = dmap.groupby(level=0, axis=1).mean() # average over duplicate genes

# average over dmap replicates
dmap.index = [re.sub('_\d{1,2}', '', i) for i in dmap.index]
dmap = dmap.groupby(level=0, axis=0).mean()

# z-normalize
dmap = pd.DataFrame(data = zscore(dmap,axis=0), columns = dmap.columns, index = dmap.index)

# GARVAN (NOT YET LOG TRANSFORMED)
garvan = pd.read_csv(os.path.expanduser(DATA_PATH+'garvan_immune_nonorm.csv'), 
                     index_col=0)

# average over garvan replicates
garvan.columns = [re.sub('_\d', '', i) for i in garvan.columns]
garvan = garvan.groupby(level=0, axis=1).mean()# genes x cell types

# transpose and log transform data
garvan = np.log2(garvan + 1).T # cell types x genes

# z-normalize
garvan = pd.DataFrame(data = zscore(garvan,axis=0), columns = garvan.columns, index = garvan.index)


# Drop irrelevant cell types for myeloid/other compartment
dmap = dmap.drop(['BASO1', 'EOS2', 'ERY1', 'ERY2', 'ERY3', 
                  'ERY4', 'ERY5', 'GMP', 'GRAN1', 'GRAN2', 
                  'GRAN3', 'HSC1', 'HSC3', 'MEGA1', 'MEGA2', 
                  'MEP', 'PRE_BCELL2', 'PRE_BCELL2', 
                  'PRE_BCELL3','CMP','GMP','HSC1','HSC3'], axis=0)
garvan = garvan.drop(['basophils','eosinophils', 'eosinophils_PMAh',
       'eosinophils_control_rep2'], axis=0)
In [42]:
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
meta= 'CELL_TYPE'
FLATUI_PLOT =FLATUI_CELL_TYPE
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 1,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 5)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
retain_goi = list(retain_bool[retain_bool].index)
print(len(retain_goi))

# IDENTIFY VARIABLE GENES ALSO DETECTED IN BULK MRNA DATA
expressed = list(set(retain_goi).intersection(dmap.columns).intersection(garvan.columns))
print(len(expressed))

# SELECT INTERSECTING SINGLE CELL VARIABLE/EXPRESSED GENES IN BULK DATASETS
exec('QUERY = np.log2(INDF_{}+1)'.format(subset_type))# LOG TRANSFORM FOR COMPARISON TO LOG TRANSFORMED BULK DATA
centroid_data = pd.DataFrame(data = zscore(QUERY[expressed].values,axis=0),columns = expressed, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS
bulk_sigs = dmap[expressed].append(garvan[expressed])

# COLORMAP LUT BASED ON QUERY & MET
ind = QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))

# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
6245
2576
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_GENE_FILTERING
In [43]:
### CORRELATION BETWEEN CENTROIDS ####
rvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])
pvalues = np.zeros([centroid_data.shape[0], bulk_sigs.shape[0]])

for centroid_ind, centroid_ind_name in enumerate(centroid_data.index):
    for bulk_ind,val in enumerate(bulk_sigs.index):
        m = centroid_data.ix[centroid_ind]
        h = bulk_sigs.ix[val]
        # Linear Regression
        lr = linregress(m, h)
        slope = lr[0]
        intercept = lr[1]
        rvalue = lr[2]
        pvalue = lr[3]
        stderr = lr[4]
        rvalues[centroid_ind,bulk_ind] = rvalue
        pvalues[centroid_ind,bulk_ind] = pvalue

# Generate Dataframe
rvalues = pd.DataFrame(
    data=rvalues,
    columns=list(bulk_sigs.index),
    index=centroid_data.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'

pvalues = pd.DataFrame(
    data=pvalues,
    columns=list(bulk_sigs.index),
    index=centroid_data.index)

pvalues.columns.name = 'Patient Single Cell Lung Epithelium'

# ROW COLORS
ind = list(rvalues.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)

# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average'
metric = 'euclidean'
linkage = hc.linkage(rvalues, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(rvalues.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = rvalues.ix[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(15,round(rvalues.shape[0]/3)))

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.125,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[r1]:
    pos = (x, y / len(r1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(r1), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

minplot = 0.20

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.16,0.1,0.7,0.6])
im = axmatrix.matshow(mat[(pvalues < 0.05)*(np.abs(mat)>minplot)], aspect='auto', 
                      origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
plt.setp(axmatrix.spines.values(), linewidth=0.5)

# ADD COLORBAR
axcolor = fig.add_axes([0.16,0.71,0.7,0.03])
cbar = plt.colorbar(im, cax=axcolor, orientation = 'horizontal')
cbar.ax.xaxis.set_ticks_position('top')
cbar.outline.set_visible(False)

# ADD DENDROGRAM
matplotlib.rcParams['lines.linewidth'] = 0.5
sch.set_link_color_palette(['#000000', '#000000', '#000000', '#000000','#000000','#000000','#000000'])
ax2 = fig.add_axes([0.045,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#000000')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '_{}_CORRELATION_{}_DBGAPandGARVAN_{}_{}'.format(subset_type,meta,method,metric) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_CORRELATION_CELL_TYPE_DBGAPandGARVAN_average_euclidean

Patient mixing within annotated cell types

Cell type fraction per patient

In [44]:
subset_type = 'ALL'
meta = 'CELL_TYPE'
cm_flatui = FLATUI_CELL_TYPE

datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

exec('tissue_cluster_sizes = QUERY.groupby(level=[\'Legend\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(meta))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]

# BARPLOT SHOWING FRACTION CELL TYPE DETECTED PER PATIENT SAMPLE
plt.figure(figsize = (5,2))
fig.canvas.draw()
sns.set_style("white")
ax = plt.gca()
legend_kwargs={'ncol': 2, 'borderaxespad':0.,'bbox_to_anchor':(1, 0)},
r1 = ['RU675_NORMAL_1ANS_UTR','RU682_NORMAL_2AS_UTR','RU684_NORMAL_1AS_UTR','RU685_NORMAL_1ANS_UTR',
     'RU653_TUMOR_1AS_UTR','RU661_TUMOR_1AS_UTR','RU675_TUMOR_1ANS_UTR','RU676_TUMOR_1AS_UTR',
     'RU679_TUMOR_2AS_TR','RU680_TUMOR_1AS_UTR','RU682_TUMOR_2AS_UTR','RU684_TUMOR_1AS_UTR',
     'RU255B_BRAINMET_4S_TR','RU666_SPINEMET_4NS_TR','RU681_BRAINMET_4S_TR','RU699_ADRENALMET_4','RU701_BRAINMET_4'][::-1]
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0).ix[r1].plot.barh(stacked=True, 
                                                                                   color=colors, 
                                                                                   ax = ax, 
                                                                                   width = 0.95)
plt.xticks(fontsize = 8)
plt.yticks(fontsize = 8)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.legend_.remove()
plt.tight_layout()

# SAVE FIGURE
figure_label = '_{}_{}perSample_barplot'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=fig_dpi, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_ALL_CELL_TYPEperSample_barplot

Compare cell type abundance in normal lung, primary tumours and metastases

In [45]:
# SOURCE, CELL TYPE, FRACTION
vals = list(tissue_cluster_sizes.index)

classes = []
for name in vals:
    if 'MET' in name:
        classes.append('METASTASIS')
    elif 'NORMAL' in name:
        classes.append('NORMAL')
    else: 
        classes.append('PRIMARY')
        
tissue_cluster_sizes['META SOURCE'] = classes

tissue_cluster_sizes = tissue_cluster_sizes.set_index('META SOURCE')
tissue_cluster_fractions = tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1), axis=0)

# FORMAT BOXPLOT DATA
boxplot_data = []
pseudocount = 0.0001
for ii in np.arange(tissue_cluster_fractions.shape[0]):
    for jj in np.arange(tissue_cluster_fractions.shape[1]):
        boxplot_data.append({'META SOURCE':tissue_cluster_fractions.index[ii], 
                             'CELL TYPE':tissue_cluster_fractions.columns[jj], 
                             'FRACTION':np.log10(tissue_cluster_fractions.iloc[ii,jj]+pseudocount)})
boxplot_data = pd.DataFrame(boxplot_data)

palette = {
    'NORMAL': '#E6E6FA',
    'PRIMARY': '#000000',
    'METASTASIS': '#FF0000'
}

# GROUPED BOXPLOT WITH DATA POINTS OVERLAYED
fig = plt.figure(figsize = (20,6))
ax1 = plt.gca()
sns.set_style("white")
labels = ['EPITHELIAL','PROLIFERATING MESENCHYMAL PROGENITOR','FIBROBLAST','PERICYTE','ENDOTHELIAL',
         'MONOCYTE', 'NEUTROPHIL','MDSC','MACROPHAGE','DENDRITIC', 'DENDRITIC (ACTIVATED)',
         'MICROGLIA/MACROPHAGE','NK', 'NKT','Th', 'Tm', 'Treg','Breg','IG','MAST']

g = sns.boxplot(x="CELL TYPE", y="FRACTION", hue="META SOURCE",data=boxplot_data, palette=palette,
                hue_order = ['NORMAL','PRIMARY','METASTASIS'],
                order = labels, fliersize = 4, showmeans=False,linewidth = 1)
g = sns.swarmplot(x="CELL TYPE", y="FRACTION", hue="META SOURCE",data=boxplot_data, 
                  palette={'NORMAL': '#E6E6FA','PRIMARY': '#606060','METASTASIS': '#FF0000'},
                  size = 5,edgecolor='#000000', linewidth=1,
                  hue_order = ['NORMAL','PRIMARY','METASTASIS'],order = labels, split=True)
g.set_ylabel("\n\n\nLog10Fraction",fontsize=14,fontname = 'Arial')
g.set_xticklabels(labels,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=14)
sns.despine()

ax1.legend_.remove()

# SAVE FIGURE
figure_label = '{}_CELLTYPE_FRACTION_BY_SOURCE_BOXPLOT_LOG'.format(subset_type).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDALL_CELLTYPE_FRACTION_BY_SOURCE_BOXPLOT_LOG
In [46]:
#The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal. 
#It is a non-parametric version of ANOVA. The test works on 2 or more independent samples, which may have different sizes. 
#Note that rejecting the null hypothesis does not indicate which of the groups differs. 
#Post-hoc comparisons between groups are required to determine which groups are different.

# ANOVA BY CELL TYPE (assumes normal distribution)
query_types= []
for cell_query in list(tissue_cluster_fractions.columns):
    
    # SUBSET DATA FROM CELL TYPE
    tmp = tissue_cluster_fractions[cell_query]

    # Perform the ANOVA
    if (stats.kruskal(tmp['NORMAL'], tmp['PRIMARY'], tmp['METASTASIS'])[1] < 0.05):
        print(cell_query)
        print(stats.kruskal(tmp['NORMAL'], tmp['PRIMARY'], tmp['METASTASIS']))
        query_types.append(cell_query)

print(query_types)
Breg
KruskalResult(statistic=7.03235294117647, pvalue=0.02971282611750145)
DENDRITIC
KruskalResult(statistic=6.075490196078427, pvalue=0.04794287418200022)
ENDOTHELIAL
KruskalResult(statistic=7.517647058823528, pvalue=0.02331114913026814)
FIBROBLAST
KruskalResult(statistic=6.085294117647059, pvalue=0.047708435167748434)
MAST
KruskalResult(statistic=6.414705882352941, pvalue=0.04046358111390589)
NEUTROPHIL
KruskalResult(statistic=6.91730061349693, pvalue=0.03147221119630172)
NK
KruskalResult(statistic=8.717647058823523, pvalue=0.012793429893414327)
['Breg', 'DENDRITIC', 'ENDOTHELIAL', 'FIBROBLAST', 'MAST', 'NEUTROPHIL', 'NK']

Compute patient entropy within each cell type

In [47]:
# RANGE OF CLUSTER SIZES
meta = 'CELL_TYPE'
FLATUI_PLOT = FLATUI_CELL_TYPE

exec('QUERY = INDF_{}'.format(subset_type))
cell_types = np.unique(QUERY.index.get_level_values(meta))

# To evaluate patient mixing within each cell type, we: 
# (1) randomly sampling ncells per cell type
# (2) Compute the Shannon Diversity index based on sample ID within the sampled subset. 
# (3) Repeat nreps times to achieve a range of entropy values per cell type. 
ncells = 100 # randomly sample this number of cells within each cell type
nreps = 100 # repeat sampling this many times

# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# ENTROPY FOR CELL SUBSETS
H = pd.DataFrame()

for cell_type in cell_types:
    
    # SUBSET CELLS BELONGING TO SPECIFIC CELL TYPE
    CELL_TYPE_SUBSET = QUERY.select(lambda x: x[[ind for ind, name in enumerate(QUERY.index.names) if name==meta][0]] in [cell_type])

    H_sample = []

    for rep in np.arange(nreps):
        # Randomly sample 100 Cells from cluster
        sampling = CELL_TYPE_SUBSET.sample(n=ncells,replace=True)
    
        # Compute probability of cells sampled for individual patients
        labels = sampling.index.get_level_values('Legend')
        probs = [np.mean(labels == c) for c in set(labels)]
        # Compute entropy of distribution across patients
        H_sample.append(np.sum(-p * np.log(p) for p in probs))
    
    # Results for cell type 
    H = H.append(pd.DataFrame({'Entropy across patients': H_sample, 'Cell Type':[cell_type]*nreps}))
In [48]:
# VISUALIZE PATIENT ENTROPY BY RANKED BOXPLOT
fig = plt.figure(figsize = (8,3))
ax1 = plt.gca()
sns.set_style("white")

# Rank boxplots by entropy
plot_order = H.set_index('Cell Type').groupby('Cell Type').mean().sort_values(by=['Entropy across patients']).index

g = sns.boxplot(x="Cell Type", y="Entropy across patients",data=H, palette=palette,
                fliersize = 4, showmeans=False,linewidth = 1, order = plot_order)
g.set_ylabel("Entropy across patients",fontsize=10,fontname = 'Arial')
g.set_xticklabels(plot_order,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=10)
sns.despine()

# SAVE FIGURE
figure_label = '{}_PATIENT_ENTROPY_PER_{}'.format(subset_type,meta).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDALL_PATIENT_ENTROPY_PER_CELL_TYPE

Lineage annotation within epithelial cells

Description

In this section, we analyze at greater resolution the epithelial compartment to better understand the relationship between normal lung epithelial cell types and cancer; as well as tumor-cell intrinsic adaptations.

The normal, adult human lung

In [49]:
subset_type = 'EPITHELIAL_NOR'

Load data subset

In [50]:
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize Phenograph clusters and lineage assignments

In [51]:
dimtype = 'ForceDirected' # we now use force directed layout for visualization, because anlayzing cell state heterogneiety within a given cell type
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes. 
dot_size = 5

xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# Sample ID
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'), 
                              cmap=CM_NORMAL,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()

# Phenograph Cluster
ax2 = plt.subplot(gs1[1])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('PHENOGRAPH_CLASS'), 
                                  cmap= CM_PHENOGRAPH_CLASS_NORMAL, 
                                  legend_kwargs={'ncol': 3, 'borderaxespad':0.,
                                            'bbox_to_anchor':(0.75, 0)},
                                  s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')

# Cluster Assignment
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'), 
                                  cmap= CM_CLASS_NORMAL_LINEAGES, 
                                  legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                                  s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')

# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_LEGEND_PHENOGRAPH_CELLTYPE_ForceDirected

Query marker expression by lineage

Dot plot of un-imputed data

In [52]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CLASS_NORMAL_LINEAGES

# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
     
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['AE2','AE1','CLUB','CILIATED']

marker_genes_dict = {
                     'AEC2': ['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'],
                     'AEC1': ['PDPN','HOPX','AGER','CAV1','SEMA3B'],
                     'CLUB': ['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'],
                     'CILIATED':['SOX2','FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'],

                    }
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['AEC2','AEC1','CLUB','CILIATED']

# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
    genes = marker_genes_dict[gene_group]
    genes = [gene for gene in genes if gene in QUERY.columns]
    labels = [gene_group] * len(genes)
    current = pd.DataFrame({'labels':labels,'genes':genes})
    tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values

var_group_positions = []
for gene_group in var_group_labels:
    cpositions = list(tmp[tmp['labels'] ==gene_group].index)
    var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))

# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
        expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
        dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
        var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
        var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
        save=None)

# COLORMAP LUT BASED ON QUERY & MET
ind = categories
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_dotPlot_markerGenes_byCELL_TYPE
<Figure size 360x720 with 0 Axes>

Matrix plot of imputed data

In [53]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CLASS_NORMAL_LINEAGES

# ROW INDEX
genes = marker_genes
exec('QUERY = INDF_{}'.format(subset_type))

# SELECT UNIQUE, DE GENES
genes,ind = np.unique(genes, return_index=True)

# CONSTRUCT HEATMAP DATA
genes = [g for g in genes if g in list(QUERY.columns)] # Detected genes
heatmap_data = pd.DataFrame(data = zscore(QUERY[genes].values,axis=0),columns = genes, index = QUERY.index).groupby(level= meta, axis=0).median() #CLUSTERS

yticks = heatmap_data.index
xticks = heatmap_data.columns

# Palatte for Class METADATA
ind = list(heatmap_data.index) # CLUSTERS
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# LINKAGE 
method = 'average' # average, single
metric = 'correlation' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = hc.leaves_list(col_linkage)
mat = heatmap_data.iloc[r1,cl]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(15,5))
plt.rcParams["axes.grid"] = False
import matplotlib.patches as patches

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.080,0.1,0.02,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors[list(mat.index)]:
    pos = (x, y / len(mat.index))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(mat.index), color=c))
    if y >= len(r1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.1,0.1,0.7,0.6])
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-3,vmax=3)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 14)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# ADD COLORBAR
axcolor = fig.add_axes([0.81,0.1,0.01,0.6])
cbar = plt.colorbar(im, cax=axcolor)
cbar.ax.get_yaxis().labelpad = 20
cbar.ax.tick_params(labelsize=10)
cbar.ax.set_ylabel('Median Expression per {}'.format(meta), rotation=270, fontsize = 10)

# SAVE FIGURE
figure_label = '_{}_matrixPlotMedian_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_matrixPlotMedian_markerGenes_byCELL_TYPE

Heatmap of top DEG at the level of single cells

In [54]:
# SELECT TOP <top_cutoff> DE GENES BY CELL TYPE
top_cutoff = 60 
meta = 'PHENOGRAPH_CLASS'
exec('class_names = [name for name in np.unique(GENE_RANK_MAST_{}.columns) if \'{}\' in name]'.format(subset_type,meta))
gene_array = pd.DataFrame()
for title in class_names:
    exec('tmp = GENE_RANK_MAST_{}[title]'.format(subset_type))
    tmp = tmp.sort_values(axis=0, ascending=False)
    gene_subset = list(tmp.index)
    strs = [title for x in range(len(gene_subset))]
    cdf = pd.DataFrame({'Gene':gene_subset[:top_cutoff], meta: strs[:top_cutoff]})
    gene_array = pd.concat([gene_array,cdf],axis=0)
In [55]:
# COLOR ROWS BY ANNOTATED CELL TYPE
exec('ind = INDF_{}.index.get_level_values(\'CELL_TYPE\')'.format(subset_type))
FLATUI_PLOT = FLATUI_PHENOGRAPH_CLASS_NORMAL

# Palatte for Class METADATA
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind).map(lut)

# COLUMN INDEX AND COLORS
genes = np.unique(gene_array['Gene'].values)

# CONSTRUCT HEATMAP DATA
exec('heatmap_data = pd.DataFrame(data = zscore(INDF_{}[genes].values,axis=0),columns = genes)'.format(subset_type))
yticks = heatmap_data.index
xticks = heatmap_data.columns

# LINKAGE 
method = 'average' # average, single
metric = 'cosine' # cosine
linkage = hc.linkage(heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# CLUSTERED HEATMAP OF DE GENES BY SUBPOPULATION,Only showing DE
sns.set(font_scale=0.5)

g = sns.clustermap(heatmap_data,
                   row_linkage = row_linkage, col_linkage=col_linkage,  
                   figsize = (20,18),linewidth=0,row_colors = row_colors,
                   cmap = plt.cm.RdBu_r,vmin = -3, vmax = 3)
ax = g.ax_heatmap
ax.set_yticklabels([]);

# Only label marker genes for simplified visual
labels = [item.get_text() for item in ax.get_xticklabels()]
for ind,label in enumerate(labels):
    if label not in marker_genes:
        labels[ind] = ''
        
h = ax.set_xticklabels(labels, rotation=90, fontsize=20);

# SAVE FIGURE
figure_label = '_{}_{}_HEATMAP_TOP{}_DE_GENES_{}_{}_INDF'.format(meta,subset_type,top_cutoff,method,metric)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
g.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_PHENOGRAPH_CLASS_EPITHELIAL_NOR_HEATMAP_TOP60_DE_GENES_average_cosine_INDF

Stacked violin plots of imputed data

In [56]:
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_CLASS_NORMAL_LINEAGES

exec('QUERY = INDF_{}'.format(subset_type))

plot_order = ['AE2','AE1','CLUB','CILIATED']
genes = marker_genes

# COLOR ROWS BY <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})
# (0,0)  Plot violin plot
N = len(plot_order)

# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (3,N*1))
sns.set_style("white")

for iii,cell_type in enumerate(plot_order):
    ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
    g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type], 
                       inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1, 
                       scale = 'count',color = palette[cell_type],ax = ax1)
    g.set_ylabel(" ")
    g.set_xlabel(" ")
    g.tick_params(axis='y',labelsize=8, rotation = 0)
    plt.ylim(0,5)
    plt.title("{}".format(cell_type.title()),fontsize=8,fontname = 'Arial',loc = 'left',
              rotation = 'horizontal', ha = 'left', pad = 1.5)

    ax1.spines['top'].set_linewidth(0.5)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_linewidth(0.5)
    ax1.spines['bottom'].set_linewidth(0.5)
    ax1.spines['left'].set_linewidth(0.5)
    ax1.spines['right'].set_visible(False)
    
    if iii != (N-1):
        ax1.tick_params(axis='x', bottom=False, labelbottom=False)
    else:
        ax1.tick_params(axis='x', labelsize=8, rotation = 90)
        
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_stackedViolin_markerGenes_byCELL_TYPE

Lineage fraction per patient

In [57]:
meta = 'CELL_TYPE'
cm_flatui = FLATUI_CLASS_NORMAL_LINEAGES

datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

exec('tissue_cluster_sizes = QUERY.groupby(level=[\'Legend\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(meta))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]

# BARPLOT SHOWING FRACTION CELL TYPE DETECTED PER PATIENT SAMPLE
plt.figure(figsize = (6,2))
ax = plt.gca()
legend_kwargs={'ncol': 2, 'borderaxespad':0.,'bbox_to_anchor':(1, 0)},
r1 = ['RU675_NORMAL_1ANS_UTR','RU682_NORMAL_2AS_UTR','RU684_NORMAL_1AS_UTR','RU685_NORMAL_1ANS_UTR'][::-1]
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0).ix[r1].plot.barh(stacked=True, 
                                                                                   color=colors, 
                                                                                   ax = ax,                                                                                   width = 0.95)
plt.xticks(fontsize = 8)
plt.yticks(fontsize = 8)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.legend_.remove()
plt.tight_layout()

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False) # labels along the bottom edge are off

plt.tick_params(
    axis='y',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    left=False,      # ticks along the bottom edge are off
    right=False,         # ticks along the top edge are off
    labelleft=True) # labels along the bottom edge are off

# SAVE FIGURE
figure_label = '_{}_{}perSample_barplot'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=fig_dpi, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_CELL_TYPEperSample_barplot

The merged normal lung and primary tumour cells

In [58]:
subset_type = 'EPITHELIAL_NOR_TUMOR'

Load data subset

In [59]:
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize Phenograph clusters and lineage assignments

In [60]:
dimtype = 'ForceDirected'
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.1, hspace=0.1) # set the spacing between axes. 
dot_size = 5

xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('Legend'), 
                              cmap=CM_CANCERandNORMAL,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()

# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('PHENOGRAPH_CLASS'), 
                                  cmap= CM_CLASS, 
                                  legend_kwargs={'ncol': 3, 'borderaxespad':0.,
                                            'bbox_to_anchor':(0.75, 0)},
                                  s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')

# CELL TYPE ASSIGNMENTS (STEM/PROGENITOR IS THE MIXED LINEAGE STATE)
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'), 
                                  cmap= CM_LINEAGE, 
                                  legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                                  s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')

# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_LEGEND_PHENOGRAPH_CELLTYPE_ForceDirected

Query marker expression by lineage

Load canonical marker gene lists per cell type

In [61]:
# LOAD GENESETS
# ADD REFERENCE TO SUPPLEMENTARY FILE
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')

genesets_include = ['AEP_MORRISEY_2FC','AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS']
lineage_genes = np.unique(genesets[genesets_include].values.ravel().tolist())

lut = {}
for geneset in genesets_include:
    current_genes = [gene for gene in np.unique(genesets[geneset].values.ravel().tolist()) if gene != 'nan']
    current_label = [geneset]*len(current_genes)
    current_lut = dict(zip(current_genes,current_label))
    lut = dict(lut, **current_lut)

Dot plot of un-imputed data

In [62]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE

# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
     
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR']

marker_genes_dict = {
                     'AEC2': ['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'],
                     'AEC1': ['PDPN','HOPX','AGER','CAV1','SEMA3B'],
                     'CLUB': ['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'],
                     'CILIATED':['FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'],
                     'BASAL':['KRT14', 'APOE','EMP3','APOC1','CD44'],
                     'NE': ['UCHL1','ASCL1','CALCA'],
                     'AEP': ['TM4SF1','MUC5B','LRP5'],
                     'PROGENITOR TF': ['SOX2','SOX9','ID2','NKX2-1']
                    }
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['AEC2','AEC1','CLUB','CILIATED','BASAL','NE','AEP','PROGENITOR TF']

# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
    genes = marker_genes_dict[gene_group]
    genes = [gene for gene in genes if gene in QUERY.columns]
    labels = [gene_group] * len(genes)
    current = pd.DataFrame({'labels':labels,'genes':genes})
    tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values

var_group_positions = []
for gene_group in var_group_labels:
    cpositions = list(tmp[tmp['labels'] ==gene_group].index)
    var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))

# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
        expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
        dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
        var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
        var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
        save=None)


# COLORMAP LUT BASED ON QUERY & MET
ind = categories
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_dotPlot_markerGenes_byCELL_TYPE
<Figure size 360x720 with 0 Axes>

Stacked violin plots of imputed data

In [63]:
meta='CELL_TYPE'
FLATUI_PLOT =  FLATUI_LINEAGE

exec('QUERY = INDF_{}'.format(subset_type))

plot_order = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR']
genes = marker_genes

# COLOR ROWS BY <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY[genes],axis=0), columns = genes, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})

# (0,0)  Plot violin plot
N = len(plot_order)

# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (4,N*0.80))
sns.set_style("white")

for iii,cell_type in enumerate(plot_order):
    ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
    g = sns.violinplot(x="gene", y="Expression", data=violin_data[violin_data[meta]==cell_type], 
                       inner = 'quart',orient = 'v',bw = 0.15, linewidth = 0.1, 
                       scale = 'count',color = palette[cell_type],ax = ax1)
    g.set_ylabel(" ")
    g.set_xlabel(" ")
    g.tick_params(axis='y',labelsize=8, rotation = 0)
    plt.ylim(0,10)
    plt.title("{}".format(cell_type.title()),fontsize=6,fontname = 'Arial',loc = 'left',
              rotation = 'horizontal', ha = 'left', pad = 1.5)

    ax1.spines['top'].set_linewidth(0.5)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_linewidth(0.5)
    ax1.spines['bottom'].set_linewidth(0.5)
    ax1.spines['left'].set_linewidth(0.5)
    ax1.spines['right'].set_visible(False)
    
    if iii != (N-1):
        ax1.tick_params(axis='x', bottom=False, labelbottom=False)
    else:
        ax1.tick_params(axis='x', labelsize=8, rotation = 90)
        
# SAVE FIGURE
figure_label = '_{}_stackedViolin_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_stackedViolin_markerGenes_byCELL_TYPE

Stacked ridge plots of imputed data

In [64]:
gene_name = 'TM4SF1'
exec('QUERY = INDF_{}'.format(subset_type))

# Color rows by <meta>
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# IMPUTED EXPRESSION PER GENE TAGGED BY METADATA in VIOLIN DATAFRAME
violin_data = pd.DataFrame(data = zscore(QUERY,axis=0), columns = QUERY.columns, index = QUERY.index)
violin_data = pd.DataFrame(data = violin_data.values, columns = violin_data.columns, index = violin_data.index.get_level_values(meta)).stack()
violin_data = pd.DataFrame({meta: violin_data.index.get_level_values(0), 
                            'gene':violin_data.index.get_level_values(1),
                            'Expression':violin_data.values})


gene_data = violin_data[violin_data['gene']==gene_name]
row_order = list(gene_data.groupby(meta).median().sort_values(by=['Expression']).index)

sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(gene_data, row=meta, 
                  hue = meta,aspect=10, size = 0.30,row_order = row_order,
                  palette=palette,col_wrap=None, sharex=True, sharey=True)

# Draw the densities in a few steps
g.map(sns.kdeplot, "Expression", clip_on=False, shade=True, alpha=1, lw=1, bw=.2, kernel = 'gau')
g.map(sns.kdeplot, "Expression", clip_on=False, color="w", lw=1, bw=.2, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    ax = plt.gca()
    ax.text(0, 0.1, label,color=color,fontsize=8,
            ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Expression")

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.75)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

plt.xlabel('KDE({} expression)'.format(gene_name.title()),fontsize=10)

# SAVE FIGURE
figure_label = '_{}_stackedKDE_{}_by{}'.format(subset_type,gene_name,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_stackedKDE_TM4SF1_byCELL_TYPE

Patient mixing within each lineage

Lineage fraction per patient

In [65]:
meta = 'CELL_TYPE'
cm_flatui = FLATUI_LINEAGE

datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

exec('tissue_cluster_sizes = QUERY.groupby(level=[\'Legend\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(meta))

# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(cm_flatui),3))
for ii,hexcolor in enumerate(cm_flatui):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]

# BARPLOT SHOWING FRACTION CELL TYPE DETECTED PER PATIENT SAMPLE
plt.figure(figsize = (3,2))
ax = plt.gca()
legend_kwargs={'ncol': 2, 'borderaxespad':0.,'bbox_to_anchor':(1, 0)},
r1 = ["RU675_NORMAL_1ANS_UTR", "RU682_NORMAL_2AS_UTR", "RU684_NORMAL_1AS_UTR","RU685_NORMAL_1ANS_UTR",
          "RU653_TUMOR_1AS_UTR","RU661_TUMOR_1AS_UTR","RU675_TUMOR_1ANS_UTR","RU676_TUMOR_1AS_UTR",
          "RU679_TUMOR_2AS_TR","RU680_TUMOR_1AS_UTR","RU682_TUMOR_2AS_UTR","RU684_TUMOR_1AS_UTR"][::-1]
tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0).ix[r1].plot.barh(stacked=True, 
                                                                                   color=colors, 
                                                                                   ax = ax, 
                                                                                   width = 0.95)
plt.xticks(fontsize = 8)
plt.yticks(fontsize = 8)
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.legend_.remove()
plt.tight_layout()

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False) # labels along the bottom edge are off

plt.tick_params(
    axis='y',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    left=False,      # ticks along the bottom edge are off
    right=False,         # ticks along the top edge are off
    labelleft=True) # labels along the bottom edge are off

# SAVE FIGURE
figure_label = '_{}_{}perSample_barplot'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=fig_dpi, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_CELL_TYPEperSample_barplot

Entropy of lineage across patients

In [66]:
# RANGE OF CLUSTER SIZES
meta = 'CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE
exec('QUERY = INDF_{}'.format(subset_type))

cell_types = np.unique(QUERY.index.get_level_values(meta))

# To evaluate patient mixing within each cell type, we: 
# (1) randomly sampling ncells per cell type
# (2) Compute the Shannon Diversity index based on sample ID within the sampled subset. 
# (3) Repeat nreps times to achieve a range of entropy values per cell type. 
ncells = 100 # randomly sample this number of cells within each cell type
nreps = 20 # repeat sampling this many times, scaled to cover sample size.

# GENERATE COLORMAP
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# ENTROPY FOR CELL SUBSETS
H = pd.DataFrame()

for cell_type in cell_types:
    
    # SUBSET CELLS BELONGING TO SPECIFIC CELL TYPE
    CELL_TYPE_SUBSET = QUERY.select(lambda x: x[[ind for ind, name in enumerate(QUERY.index.names) if name==meta][0]] in [cell_type])

    H_sample = []

    for rep in np.arange(nreps):
        # Randomly sample 100 Cells from cluster
        sampling = CELL_TYPE_SUBSET.sample(n=ncells,replace=True)
    
        # Compute probability of cells sampled for individual patients
        labels = sampling.index.get_level_values('Legend')
        probs = [np.mean(labels == c) for c in set(labels)]
        # Compute entropy of distribution across patients
        H_sample.append(np.sum(-p * np.log(p) for p in probs))
    
    # Results for cell type 
    H = H.append(pd.DataFrame({'Entropy across patients': H_sample, 'Cell Type':[cell_type]*nreps}))
In [67]:
# RANKED KDE OF ENTROPY TO ACCOMPANY HEATMAP
#### VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(0.6,5))
H_row_order = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR']
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize the FacetGrid object
g = sns.FacetGrid(H, row='Cell Type', hue = 'Cell Type',aspect=3, size = 0.30,row_order = H_row_order,
                  palette=palette,col_wrap=None, sharex=True, sharey=True)

# Draw the densities in a few steps
g.map(sns.kdeplot, "Entropy across patients", clip_on=False, shade=True, alpha=1, lw=1, bw=.1, kernel = 'gau')
g.map(sns.kdeplot, "Entropy across patients", clip_on=False, color="w", lw=1, bw=.1, kernel = 'gau')
g.map(plt.axhline, y=0, lw=1, clip_on = False)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    ax = plt.gca()
    ax.text(0, 0.1, label,color=color,fontsize=8,
            ha="right", va="bottom", transform=ax.transAxes)
g.map(label, "Entropy across patients")

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-0.25)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

plt.xlabel('KDE(Entropy across patients)',fontsize=10)

# SAVE FIGURE
figure_label = '{}_PATIENT_ENTROPY_PER_{}_STACKED_VIOLINS'.format(subset_type,meta).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDEPITHELIAL_NOR_TUMOR_PATIENT_ENTROPY_PER_CELL_TYPE_STACKED_VIOLINS
<Figure size 43.2x360 with 0 Axes>

Meta-source fraction per cell type

In [68]:
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

tissue_cluster_sizes = QUERY.groupby(level=['CELL_TYPE', 'Meta-Source'], axis=0).size().unstack().fillna(0)

# CONVERT HEX TO RGB (FLATUI_CLASS)
FLATUI_PLOT = ['E6E6FA', # NORMAL
               '000000'] # PRIMARY
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
metacell_colors = [rgb2hex(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]

plt.figure(figsize = (2,1))
ax = plt.gca()
D = tissue_cluster_sizes.reindex(['AEC2','AEC1', 'CLUB', 'CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR'])
D.div(D.sum(axis=1),axis=0).plot.bar(stacked=True, color=metacell_colors, ax = ax, width = 0.85)
ax.legend_.remove()
plt.axis('off')

# SAVE FIGURE
figure_label = '_{}Distribution_PerSource_{}_PathwayMerged_fraction'.format(meta,subset_type).replace('.','_')
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_CELL_TYPEDistribution_PerSource_EPITHELIAL_NOR_TUMOR_PathwayMerged_fraction

Lineage combinatorics in single cells

Binary heatmap of lineage-specific genes expressed in top quartile in single cells

In [69]:
# COMPUTED ON UN-IMPUTED, NORMALIZED DATA

# COLOR BY CELL_TYPE, SOURCE AND PHENOGRAPH CLASS
exec('ind = NDF_{}.index.get_level_values(\'CELL_TYPE\')'.format(subset_type))
exec('ind2 = NDF_{}.index.get_level_values(\'Meta-Source\')'.format(subset_type))
exec('ind3 = NDF_{}.index.get_level_values(\'PHENOGRAPH_CLASS\')'.format(subset_type))

# Palatte for Class METADATA
FLATUI_PLOT =  FLATUI_LINEAGE
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors1 = pd.Series(ind).map(lut)

# Palatte for Class METADATA
FLATUI_SOURCE_RX = [
                 'E6E6FA', # NORMAL
                 '000000', # PRIMARY
]

FLATUI_PLOT = FLATUI_SOURCE_RX
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind2)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind2), colors[cix,:]))
row_colors2 = pd.Series(ind2).map(lut)

FLATUI_PLOT = FLATUI_CLASS
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)

cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind3)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind3), colors[cix,:]))
row_colors3 = pd.Series(ind3).map(lut)

# COLUMN INDEX AND COLORS
exec('genes = [gene for gene in marker_genes if gene in NDF_{}.columns]'.format(subset_type))

# CONSTRUCT HEATMAP DATA
exec('heatmap_data = pd.DataFrame(data = NDF_{}[genes].values,columns = genes, index = NDF_{}.index).fillna(0)'.format(subset_type,subset_type))
binary_heatmap_data = heatmap_data>heatmap_data.quantile(q = 0.75,axis=0)
binary_heatmap_data = binary_heatmap_data.astype(int)

method = 'average' # average, single
metric = 'euclidean' # cosine
linkage = hc.linkage(binary_heatmap_data, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(binary_heatmap_data.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# CLUSTERED HEATMAP OF DE GENES BY SUBPOPULATION,Only showing DE
sns.set(font_scale=0.5)

g = sns.clustermap(binary_heatmap_data,
                   row_linkage = row_linkage,col_cluster=False,
                   linewidths=0.1,
                   figsize = (3,8),linewidth=0,row_colors=[row_colors3,row_colors2, row_colors1],
                   cmap = 'coolwarm')
ax = g.ax_heatmap
ax.set_yticklabels([]);
g.cax.set_visible(False)
        
h = ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=6);

# SAVE FIGURE
figure_label = '_{}_single_cell_lineageGeneBinary_heatmap_cellType_source_phenograph'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
g.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_single_cell_lineageGeneBinary_heatmap_cellType_source_phenograph

Fraction of canonical lineage markers expressed per cell

In [70]:
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
genesets_include = ['AEP_MORRISEY_2FC','AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS']
lineage_genes = genesets[genesets_include]

# CONSTRUCT HEATMAP DATA
exec('heatmap_data = NDF_{}.fillna(0)'.format(subset_type))
binary_heatmap_data = heatmap_data>heatmap_data.quantile(q = 0.75,axis=0)
binary_heatmap_data = binary_heatmap_data.astype(int)

#AEC2_markers = [gene for gene in lineage_genes['AEC2_ME18p5_GENES_LG'].values if gene not in [nan]]#['SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3']
AEC2_markers = set(['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'])
AEC2_markers = set([gene for gene in AEC2_markers if gene in heatmap_data.columns])

#AEC1_markers = [gene for gene in lineage_genes['AEC1_ME18p5_GENES_LG'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
AEC1_markers = set(['PDPN','HOPX','AGER','CAV1','SEMA3B'])
AEC1_markers = set([gene for gene in AEC1_markers if gene in heatmap_data.columns])
AEC1_markers = AEC1_markers-AEC2_markers

#club_markers = [gene for gene in lineage_genes['CLUB_ME18p5_GENES_LG'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
club_markers = set(['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'])
club_markers = set([gene for gene in club_markers if gene in heatmap_data.columns])
club_markers = club_markers-AEC1_markers-AEC2_markers

#ciliated_markers = [gene for gene in lineage_genes['CILIATED_ME18p5_GENES_LG'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
ciliated_markers = set(['FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'])#
ciliated_markers = set([gene for gene in ciliated_markers if gene in heatmap_data.columns])
ciliated_markers = ciliated_markers-club_markers-AEC1_markers-AEC2_markers

#basal_markers = [gene for gene in lineage_genes['LECTIN+KRT5+BASAL'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
basal_markers = set(['KRT14', 'KRT31', 'APOE','EMP3','APOC1','ARHGD1B','CD44'])
basal_markers = set([gene for gene in basal_markers if gene in heatmap_data.columns])
basal_markers = basal_markers-ciliated_markers-club_markers-AEC1_markers-AEC2_markers

#PNEC_markers = [gene for gene in lineage_genes['PNECS'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
PNEC_markers = set(['UCHL1','ASCL1','CALCA'])
PNEC_markers = set([gene for gene in PNEC_markers if gene in heatmap_data.columns])
PNEC_markers = PNEC_markers-basal_markers-ciliated_markers-club_markers-AEC1_markers-AEC2_markers

#AEP_markers = [gene for gene in lineage_genes['AEP_MORRISEY_2FC'].values if gene not in [nan]]#['PDPN','HOPX','AGER','CAV1','SEMA3B']
AEP_markers = set(['TM4SF1','MUC5B','LRP5'])
AEP_markers = set([gene for gene in AEP_markers if gene in heatmap_data.columns])
AEP_markers = AEP_markers-PNEC_markers-basal_markers-ciliated_markers-club_markers-AEC1_markers-AEC2_markers

all_markers = list(AEC2_markers)+list(AEC1_markers)+list(club_markers)+list(ciliated_markers)+list(basal_markers)+list(PNEC_markers)+list(AEP_markers)

binary_heatmap_data = binary_heatmap_data[all_markers]

a1 = {key : 'AEC1' for key in AEC1_markers}
a2 = {key : 'AEC2' for key in AEC2_markers}
a3 = {key : 'CLUB' for key in club_markers}
a4 = {key : 'CILIATED' for key in ciliated_markers}
a5 = {key : 'BASAL' for key in basal_markers}
a6 = {key : 'NE' for key in PNEC_markers}
a7 = {key : 'AEP' for key in AEP_markers}
inverted_marker_genes_dict = {**a1, **a2,**a3,**a4,**a5,**a6,**a7} 

print(np.unique(binary_heatmap_data.index.get_level_values('CELL_TYPE')))

lineage_color_lut = {'AEC1':'#FF66FF',
                         'AEC2':'#6600CC',
                         'AEP':'#9ACD32', 
                         'BASAL':'#0000FF',
                         'CILIATED':'#3399FF',
                         'CLUB':'#FF8800',
                         'NE':'#FFD700',
                         'PROGENITOR TF':'#E6E6FA'}
['AEC1' 'AEC2' 'AEP/MUCINOUS' 'CILIATED' 'CLUB' 'KRT5+BASAL' 'PNEC'
 'STEM/PROGENITOR']

Fraction AEC1/AEC2 markers expressed in normal vs. mixed lineage tumor cells

In [71]:
# GENERATE AXIS PER VIOLIN PLOT
fraction_cell_type_array = pd.DataFrame()

for cell_index in np.arange(binary_heatmap_data.shape[0]):

    current_cell = binary_heatmap_data.ix[cell_index,:]
    detected_genes = current_cell[current_cell==1].index
    detected_cell_types = list(detected_genes.map(inverted_marker_genes_dict))

    fraction_cell_type = pd.DataFrame(columns = ['Cell Type', 'Gene Counts'])
    for cell_type in np.unique(detected_cell_types):
        fraction_cell_type = fraction_cell_type.append({'Cell Type' :cell_type, 'Gene Counts': detected_cell_types.count(cell_type)}, ignore_index = True)
    
    fraction_cell_type = fraction_cell_type.set_index('Cell Type').T
    fraction_cell_type =fraction_cell_type.div(fraction_cell_type.sum(axis=1),axis=0)
    
    all_cell_types = ['AEC1','AEC2','AEP','BASAL','CILIATED','CLUB','NE']
    missing_cell_types = [cell_type for cell_type in all_cell_types if cell_type not in list(fraction_cell_type.columns)]
    for missing_type in missing_cell_types:
        fraction_cell_type[missing_type]=0 
    fraction_cell_type = fraction_cell_type.sort_index(axis=1)
    fraction_cell_type_array = fraction_cell_type_array.append(fraction_cell_type)
fraction_cell_type_array.index = binary_heatmap_data.index    
In [72]:
color_lut = {'AEC1':'#FF66FF',
                     'AEC2':'#6600CC',
                     'AEP/MUCINOUS':'#9ACD32', 
                     'KRT5+BASAL':'#0000FF',
                     'CILIATED':'#3399FF',
                     'CLUB':'#FF8800',
                     'PNEC':'#FFD700',
                     'STEM/PROGENITOR':'#E6E6FA'}

# XY Parameters
plt.figure(figsize = (5,5))
gs1 = gridspec.GridSpec(1,1)
gs1.update(wspace=0.0, hspace=0.05) # set the spacing between axes. 

sns.set_style("white")
xaxis_label = 'AEC1'
yaxis_label = 'AEC2'
x = fraction_cell_type_array[xaxis_label]
y = fraction_cell_type_array[yaxis_label]
cell_type = fraction_cell_type_array.index.get_level_values('CELL_TYPE')
phenograph_class = fraction_cell_type_array.index.get_level_values('PHENOGRAPH_CLASS')
source = fraction_cell_type_array.index.get_level_values('Meta-Source')
c = fraction_cell_type_array.index.get_level_values('CELL_TYPE').map(color_lut)

ax1 = plt.subplot(gs1[0])

plot_cell= "STEM/PROGENITOR"
pcs = [0,1] # SELECTING CELLS FROM PHENOGRAPH CLASS 0 AND 1, WHICH SHOW AEC1-AEC2 MIXING
ax = sns.kdeplot(x[(cell_type==plot_cell)*(source=='TUMOR')*([pc in pcs for pc in phenograph_class])],
                 y[(cell_type==plot_cell)*(source=='TUMOR')*([pc in pcs for pc in phenograph_class])], 
                 n_levels=10, shade=True, cmap = 'Greys',shade_lowest=False, alpha = 1,
                 )
for path in ax.collections[-7].get_paths():
    coords = path.vertices
    ax.plot(coords[:,0],coords[:,1],color_lut[plot_cell], lw = 1.2)       
plt.xlim((-0.1,1))
plt.ylim((-0.1,1))

plot_cell= yaxis_label
ax = sns.kdeplot(x[(cell_type==plot_cell)*(source=='NOR')],y[(cell_type==plot_cell)*(source=='NOR')], 
                 n_levels=10, shade=True, cmap = 'Purples',shade_lowest=False, alpha = 0.5,
                 )
for path in ax.collections[-7].get_paths():
    coords = path.vertices
    ax.plot(coords[:,0],coords[:,1],color_lut[plot_cell], lw = 1.2)
        
plt.xlim((-0.1,1))
plt.ylim((-0.1,1))

plot_cell= xaxis_label
ax = sns.kdeplot(x[(cell_type==plot_cell)*(source=='NOR')],y[(cell_type==plot_cell)*(source=='NOR')], 
                 n_levels=10, shade=True, cmap = 'PuRd',shade_lowest=False,alpha = 0.5,
                 )
for path in ax.collections[-7].get_paths():
    coords = path.vertices
    ax.plot(coords[:,0],coords[:,1],color_lut[plot_cell], lw = 1.2)
#plt.gca().axes.get_xaxis().set_visible(False)              
plt.xlim((-0.1,1))
plt.ylim((-0.1,1))

ax.spines['top'].set_linewidth(0.5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_linewidth(0.5)
ax.spines['bottom'].set_linewidth(0.5)
ax.spines['left'].set_linewidth(0.5)
ax.spines['right'].set_visible(False)

#ax1 = plt.subplot(gs1[1])

# SAVE FIGURE
figure_label = '_{}_lineageMarkersGTQ75_fraction_PC01_x{}_y{}_overlaid'.format(subset_type,xaxis_label,yaxis_label) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_lineageMarkersGTQ75_fraction_PC01_xAEC1_yAEC2_overlaid

Phenotypic Volume across variable, lineage-specific genes

Select variable, lineage-specific genes

In [73]:
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>biological_variance.mean()

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
goi = list(retain_bool[retain_bool].index)
print(len(goi))

# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
6591
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_GENE_FILTERING
In [74]:
# LOAD LINEAGE-SPECIFIC GENE SETS
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')

genesets_include = ['AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS','AEP_MORRISEY_2FC']

lineage_genes = np.unique(genesets[genesets_include].values.ravel().tolist())
lineage_genes = [gene for gene in lineage_genes if gene in list(QUERY.columns)]
variable_lineage_genes = [gene for gene in lineage_genes if gene in goi]
print(len(lineage_genes))
print(len(variable_lineage_genes))
1560
833

Compute the Phenotypic Volume

In [75]:
# COMPUTE LINEAGE PHENOTYPIC VOLUME
n = 500
reps = np.arange(50) # number of times to downsample cells
gene_subset = variable_lineage_genes # genes used in covariance calculation
print(len(gene_subset))

# GENE-GENE COVARIANCE WITHIN NORMAL SUBSET
master_groupby = 'Meta-Source'
class_selection = ['NOR']
ix = [ind for ind,name in enumerate(QUERY.index.names) if name==master_groupby][0]
logvolume_normal = np.zeros(len(reps))

for rep in reps:
    SUBSET = QUERY.select(lambda x: x[ix] in class_selection)[gene_subset].sample(n=n,replace=True)
    GENE_COV = SUBSET.cov(min_periods=None)
    eigenvalues = np.real(np.linalg.eigvals(GENE_COV))
    logvolume = np.float(np.sum(0.5*np.log10(eigenvalues[eigenvalues>0]**2))/len(eigenvalues))
    logvolume_normal[rep] = logvolume
    
# GENE-GENE COVARIANCE WITHIN TUMOR SUBSET
master_groupby = 'Meta-Source'
class_selection = ['TUMOR']
ix = [ind for ind,name in enumerate(QUERY.index.names) if name==master_groupby][0]
logvolume_tumor = np.zeros(len(reps))

for rep in reps: 
    SUBSET = QUERY.select(lambda x: x[ix] in class_selection)[gene_subset].sample(n=n,replace=True)
    GENE_COV = SUBSET.cov(min_periods=None)
    eigenvalues = np.real(np.linalg.eigvals(GENE_COV))
    logvolume = np.float(np.sum(0.5*np.log10(eigenvalues[eigenvalues>0]**2))/len(eigenvalues))
    logvolume_tumor[rep] = logvolume

# WRITE RESULTS TO DATAFRAME
boxplot_data_variable_lineage = pd.DataFrame()
boxplot_data_variable_lineage['Meta-Source'] = ['NOR']*len(logvolume_normal)+['TUMOR']*len(logvolume_tumor)
boxplot_data_variable_lineage['Log Phenotypic Volume'] = np.concatenate([logvolume_normal,logvolume_tumor],axis = 0)

# VIOLIN PLOT
fig = plt.figure(figsize = (2,8))
ax = plt.gca()
palette = dict(zip(['TUMOR','NOR'],[[0,0,0],[230/255,230/255,250/255]]))   

# Plot violin plot
g = sns.boxplot(x='Meta-Source', y="Log Phenotypic Volume" ,
                   data=boxplot_data_variable_lineage,palette = palette,notch = True, 
                   linewidth = 1, ax = ax, order = ['NOR','TUMOR']);
g.set_ylabel("Log Lineage Phenotypic Volume",fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
g.set_xticklabels(ax.get_xticklabels(),rotation=90)
sns.despine()
ax.spines['bottom'].set_linewidth(1)
ax.spines['left'].set_linewidth(1)

# STATISTIC
T = boxplot_data_variable_lineage.loc[boxplot_data_variable_lineage['Meta-Source'].isin(['TUMOR'])]['Log Phenotypic Volume'].values
N = boxplot_data_variable_lineage.loc[boxplot_data_variable_lineage['Meta-Source'].isin(['NOR'])]['Log Phenotypic Volume'].values
print(stats.mannwhitneyu(T, N))

# SAVE FIGURE
figure_label = '{}_LogVariableLineagePhenotypicVolume_n{}_reps{}'.format(subset_type,n,len(reps)).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
833
MannwhitneyuResult(statistic=0.0, pvalue=3.533035965194466e-18)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDEPITHELIAL_NOR_TUMOR_LogVariableLineagePhenotypicVolume_n500_reps50

The merged normal lung, primary tumour and metastases

In [76]:
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'

Load data subset

In [77]:
# LOAD DATA
# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize Phenograph clusters and lineage assignments

In [78]:
dimtype = 'ForceDirected'
meta = 'CELL_TYPE'
exec('QUERY = INDF_{}'.format(subset_type))
exec('DIM = DIMENSIONS_{}'.format(subset_type))
x = DIM['{}0'.format(dimtype)]
y = DIM['{}1'.format(dimtype)]

# SOURCE WITH TREATMENT INFO
Rx = [name[len(name)-3:].replace('_','').replace('T4','TR').replace('UTR','').replace('TR','Tx') for name in QUERY.index.get_level_values('Legend')]
Source = QUERY.index.get_level_values('Meta-Source')
Source_Rx = Source+Rx

# XY Parameters
plt.figure(figsize = (15,5))
gs1 = gridspec.GridSpec(1,3)
gs1.update(wspace=0.01, hspace=0.1) # set the spacing between axes. 
dot_size = 5

xmax = np.abs(x).max()
ymax = np.abs(y).max()
axis_min = -ceil(ceil(max(xmax,ymax)/10)*10)
axis_max = ceil(ceil(max(xmax,ymax)/10)*10)

# SAMPLE ID: LEGEND
ax1 = plt.subplot(gs1[0])
seqc.plot.scatter.categorical(x, y, c=Source_Rx, 
                              cmap=CM_SOURCE_RX,
                              legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)}, 
                              s=dot_size, ax=ax1);
sns.despine()
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
sns.despine()

# PHENOGRAPH CLUSTERS
ax2 = plt.subplot(gs1[1])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('PHENOGRAPH_CLASS'), 
                                  cmap= CM_CLASS_ALL_EP, 
                                  legend_kwargs={'ncol': 3, 'borderaxespad':0.,
                                            'bbox_to_anchor':(0.75, 0)},
                                  s=dot_size,randomize=True, ax = ax2);
ax2.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax2.axis('off')

# CELL TYPES ANNOTATED PREVIOUSLY IN MERGED NORMAL AND PRIMARY TUMOR EPITHELIUM
ax3 = plt.subplot(gs1[2])
g = seqc.plot.scatter.categorical(x, y, c=QUERY.index.get_level_values('CELL_TYPE'), 
                                  cmap= CM_LINEAGE, 
                                  legend_kwargs={'ncol': 1, 'borderaxespad':0.,
                                            'bbox_to_anchor':(1, 0)},
                                  s=dot_size,randomize=True, ax = ax3);
h,l=g.get_legend_handles_labels()
ax3.tick_params(labelsize=14)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax3.axis('off')

# SAVE FIGURE
figure_label = '_{}_LEGEND_PHENOGRAPH_CELLTYPE_{}'.format(subset_type,dimtype)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_MET_LEGEND_PHENOGRAPH_CELLTYPE_ForceDirected

Query marker expression by Phenograph cluster

Load canonical marker genes lists per cell type

In [79]:
# LOAD GENESETS
filename = 'lung_epithelial_lineage_signatures'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')

genesets_include = ['AEP_MORRISEY_2FC','AEC1_ME18p5_GENES_LG','AEC2_ME18p5_GENES_LG','CLUB_ME18p5_GENES_LG','CILIATED_ME18p5_GENES_LG','LECTIN+KRT5+BASAL','PNECS','MUCINOUS']
lineage_genes = np.unique(genesets[genesets_include].values.ravel().tolist())

lut = {}
for geneset in genesets_include:
    current_genes = [gene for gene in np.unique(genesets[geneset].values.ravel().tolist()) if gene != 'nan']
    current_label = [geneset]*len(current_genes)
    current_lut = dict(zip(current_genes,current_label))
    lut = dict(lut, **current_lut)

Dot plot of un-imputed data

In [80]:
# SPECIFY GROUPBY PARAMETER AND ASSOCIATED COLORMAP
meta='CELL_TYPE'
FLATUI_PLOT = FLATUI_LINEAGE

# SPECIFY DATA AND GROUP CELLS BY THIS PARAMETER
exec('QUERY = NDF_{}'.format(subset_type))
     
# RANK GROUPED CELLS BY CATEGORIES (ORDERED)
categories = ['AEC2','AEC1','CLUB','CILIATED','KRT5+BASAL','PNEC','AEP/MUCINOUS','STEM/PROGENITOR','UNKNOWN']

marker_genes_dict = {
                     'AEC2': ['SFTPA','SFTPB','SFTPC','SFTPD','LAMP3','SLC34A2','ABCA3','AQP3'],
                     'AEC1': ['PDPN','HOPX','AGER','CAV1','SEMA3B'],
                     'CLUB': ['SCGB1A1','SCGB3A1','SCGB3A2','CYP2F1'],
                     'CILIATED':['FOXJ1','CETN2','TUB1A1','TEKT1','CCDC78','DNAF1'],
                     'BASAL':['KRT14', 'APOE','EMP3','APOC1','CD44'],
                     'NE': ['UCHL1','ASCL1','CALCA'],
                     'AEP': ['TM4SF1','MUC5B','LRP5'],
                     'PROGENITOR TF': ['SOX2','SOX9','ID2','NKX2-1']
                    }
# SPECIFY ORDER OF PLOT GENES
var_group_labels = ['AEC2','AEC1','CLUB','CILIATED','BASAL','NE','AEP','PROGENITOR TF']

# CONVERT CATEGORIZED GENE DICTIONARY TO MARKER_GENES WITH POSITION ANNOTATED
tmp = pd.DataFrame()
for gene_group in var_group_labels:
    genes = marker_genes_dict[gene_group]
    genes = [gene for gene in genes if gene in QUERY.columns]
    labels = [gene_group] * len(genes)
    current = pd.DataFrame({'labels':labels,'genes':genes})
    tmp = tmp.append(current)
tmp.index = np.arange(len(tmp))
marker_genes = tmp['genes'].values

var_group_positions = []
for gene_group in var_group_labels:
    cpositions = list(tmp[tmp['labels'] ==gene_group].index)
    var_group_positions.append((cpositions[0],cpositions[len(cpositions)-1]))

# DOT PLOT OF GENES LABELED BY CATEGORY
fig = plt.figure(figsize=(5,10))
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

dotplot(QUERY[marker_genes], marker_genes, categories, groupby=meta, use_raw=None, log=False,
        expression_cutoff=0, mean_only_expressed=True, color_map='Reds', dot_max=0.75,
        dot_min=0.10, figsize=None, dendrogram=False, gene_symbols=None,
        var_group_positions=var_group_positions, standard_scale = 'var', smallest_dot=0,
        var_group_labels=var_group_labels, var_group_rotation=90, layer=None, show=None,
        save=None)


# COLORMAP LUT BASED ON QUERY & MET
ind = categories #QUERY.index.get_level_values(meta)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(np.unique(ind), colors[cix,:]))
row_colors = pd.Series(ind,index = ind).map(lut)

# SAVE FIGURE
figure_label = '_{}_dotPlot_markerGenes_by{}'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_MET_dotPlot_markerGenes_byCELL_TYPE
<Figure size 360x720 with 0 Axes>

Patient mixing within each Phenograph cluster

In [81]:
# Patient mixing within each cluster
meta = 'PHENOGRAPH_CLASS'
exec('QUERY = INDF_{}'.format(subset_type))
FLATUI_PLOT = FLATUI_CLASS_ALL_EP

cell_types = np.unique(QUERY.index.get_level_values(meta))

# To evaluate patient mixing within each cell type, we: 
# (1) randomly sampling ncells per cell type
# (2) Compute the Shannon Diversity index based on sample ID within the sampled subset. 
# (3) Repeat nreps times to achieve a range of entropy values per cell type. 
ncells = 100 # randomly sample this number of cells within each cell type
nreps = 100 # repeat sampling this many times, scaled to cover sample size.

# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

# ENTROPY FOR CELL SUBSETS
H = pd.DataFrame()

for cell_type in cell_types:
    
    # SUBSET CELLS BELONGING TO SPECIFIC CELL TYPE
    CELL_TYPE_SUBSET = QUERY.select(lambda x: x[[ind for ind, name in enumerate(QUERY.index.names) if name==meta][0]] in [cell_type])

    H_sample = []

    for rep in np.arange(nreps):
        # Randomly sample 100 Cells from cluster
        sampling = CELL_TYPE_SUBSET.sample(n=ncells,replace=True)
    
        # Compute probability of cells sampled for individual patients
        labels = sampling.index.get_level_values('Legend')
        probs = [np.mean(labels == c) for c in set(labels)]
        # Compute entropy of distribution across patients
        H_sample.append(np.sum(-p * np.log(p) for p in probs))
    
    # Results for cell type 
    H = H.append(pd.DataFrame({'Entropy across patients': H_sample, 'Cell Type':[cell_type]*nreps}))
    
# GROUPED BOXPLOT WITH DATA POINTS OVERLAYED
fig = plt.figure(figsize = (8,3))
ax1 = plt.gca()
sns.set_style("white")

# Rank boxplots by entropy
plot_order = H.set_index('Cell Type').groupby('Cell Type').mean().sort_values(by=['Entropy across patients']).index

g = sns.boxplot(x="Cell Type", y="Entropy across patients",data=H, palette=palette,
                fliersize = 4, showmeans=False,linewidth = 1, order = plot_order)
g.set_ylabel("Entropy across patients",fontsize=10,fontname = 'Arial')
g.set_xticklabels(plot_order,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=10)
sns.despine()

# SAVE FIGURE
figure_label = '{}_PATIENT_ENTROPY_PER_{}'.format(subset_type,meta).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDEPITHELIAL_NOR_TUMOR_MET_PATIENT_ENTROPY_PER_PHENOGRAPH_CLASS

Metastatic clusters ranked by lung epithelial development

Select metastatic clusters: > 10% of cells in cluster from metastases

In [82]:
# Fraction detected per metastasis
meta = 'Meta-Source'
exec('tmp = INDF_{}.groupby(level=[\'PHENOGRAPH_CLASS\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(subset_type,meta))
tmp = tmp.div(tmp.sum(axis=1),axis=0)
cluster_ind = list(tmp[tmp['MET']> 0.10].index)
print(cluster_ind)
[0, 3, 5, 6, 7, 9, 10, 13, 14, 15, 20]

Rank metastatic clusters by average expression of lung epithelial development signature and query expression of key genesets

In [83]:
# LOAD KEY GENESETS
filename = 'stem_cell_signatures_merged'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')
In [84]:
# STACKED RANKED BOXPLOTS

meta = 'PHENOGRAPH_CLASS'
FLATUI_PLOT = FLATUI_CLASS_ALL_EP

datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in set(QUERY.columns)]
genes = np.unique([x for x in genes if str(x) != 'NAN'])

detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)

violin_data = []
for ind,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        meta:QUERY.index.get_level_values(meta)[ind]}) 
violin_data = pd.DataFrame(violin_data)  
order = list(violin_data[violin_data['gene']==title].groupby(meta).median().sort_values('Z-normalized Expression',ascending = False).index)
met_order = [val for val in order if val in cluster_ind][::-1]

# GENERATE COLORMAP
# Palatte for Class METADATA
ind = QUERY.index.get_level_values(meta)
# CONVERT HEX TO RGB (FLATUI_CLASS)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
palette = dict(zip(np.unique(ind), colors[cix,:]))

plot_order = ['GO_LUNG_EPITHELIUM_DEVELOPMENT','ASC_SMITH_CELL2018',
              'HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION',
              'CELLULAR_MORPHOGENESIS_DURING_DIFFERENTIATION',
              'AEP_MORRISEY_2FC']

# (0,0)  Plot violin plot
N = len(plot_order)

# GENERATE AXIS PER VIOLIN PLOT
fig = plt.figure(figsize = (2,N*1.25))
sns.set_style("white")

for iii,title in enumerate(plot_order):
    # SPECIFY AXIS
    ax1 = plt.subplot2grid((N, 1), (iii, 0), colspan=1)
    
    # GET GENES
    genesets_include = [title]
    genes = np.unique(genesets[genesets_include].values.ravel().tolist())
    genes = [gene for gene in genes if gene in set(QUERY.columns)]
    genes = np.unique([x for x in genes if str(x) != 'NAN'])
    detected_genes = list(set(genes).intersection(set(QUERY.columns)))
    vals = QUERY[detected_genes]
    SCORE = np.nanmean(vals,axis=1)
    
    # Format data structure for violin plot
    violin_data = []
    for ind,v in enumerate(SCORE):
        violin_data.append({'gene': title, 'Z-normalized Expression': v,
                            meta:QUERY.index.get_level_values(meta)[ind]}) 
    violin_data = pd.DataFrame(violin_data)  

    g = sns.boxplot(x="gene", y="Z-normalized Expression", hue=meta,data=violin_data, palette=palette,notch = True, 
                    fliersize = 1, showmeans=False,linewidth = 0.5, hue_order = met_order)

    g.set_ylabel(" ",fontsize=10)
    g.set_xlabel(" ",fontsize=10,rotation = 90)
    g.tick_params(labelsize=10)
    sns.despine()
    ax1.legend_.remove()
    vs = violin_data.ix[[ind for ind,val in enumerate(violin_data[meta]) if val in met_order]]['Z-normalized Expression']
    ax1.set(ylim=(vs.min()*0.95, vs.max()*1.05))

    plt.title("{}".format(title),fontsize=6,fontname = 'Arial',loc = 'left',
              rotation = 'horizontal', ha = 'left', pad = 1.5)
    plt.xlabel('')

    ax1.spines['top'].set_linewidth(0.5)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_linewidth(0.5)
    ax1.spines['bottom'].set_linewidth(0.5)
    ax1.spines['left'].set_linewidth(0.5)
    ax1.spines['right'].set_visible(False)
    
    ax1.tick_params(axis='x', bottom=False, labelbottom=False)
    
print(met_order)
        
# SAVE FIGURE
figure_label = '_{}_{}_stackedViolin_markerGenes_by{}_LungEp_mean'.format(subset_type,datatype,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
[3, 0, 20, 5, 13, 7, 14, 9, 15, 6, 10]
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_MET_INDF_EPITHELIAL_NOR_TUMOR_MET_stackedViolin_markerGenes_byPHENOGRAPH_CLASS_LungEp_mean

Robustness of cluster ranking to individual genes in lung epithelial development signature

In [85]:
# ROBUSTNESS OF RANKING TO INDIVIDUAL GENES:
# STACKED RANKED BOXPLOTS
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(QUERY.columns)]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))

print(len(detected_genes))
34
In [86]:
# LEAVE ONE OUT ANALYSIS: RANK METASTATIC CLUSTERS BY GO_LUNG_EPITHELIUM_DEVELOPMENT_SIGNATURE less one gene 
# TO EVALUATE ROBUSTNESS OF CLUSTERING TO INDIVIDUAL GENE LEVELS
array_met_order = np.zeros((len(detected_genes),len(met_order)))
for ind2 in np.arange(len(detected_genes)):
    current_genes = deepcopy(detected_genes)
    dropped_value = current_genes.pop(ind2)
    
    vals = QUERY[current_genes]
    SCORE = np.nanmean(QUERY[current_genes],axis=1)

    violin_data = []
    for ind,v in enumerate(SCORE):
        violin_data.append({'gene': title, 'Score': v,
                            meta:QUERY.index.get_level_values(meta)[ind]}) 
    violin_data = pd.DataFrame(violin_data)  
    order = list(violin_data[violin_data['gene']==title].groupby(meta).median().sort_values('Score',ascending = False).index)
    met_order = [int(val) for val in order if val in cluster_ind][::-1]
    array_met_order[ind2,:] = met_order

leave_one_out_gene_ranking = pd.DataFrame(data = array_met_order,index = detected_genes)
In [87]:
fig = plt.figure(figsize=(2,5))
ax1 = fig.add_axes([0,0,1,1])

for y_ind in np.arange(leave_one_out_gene_ranking.shape[0]):
    row_vals = leave_one_out_gene_ranking.ix[y_ind].values
    row_colors = pd.Series(row_vals).map(palette)

    print(leave_one_out_gene_ranking.ix[y_ind].name)
    x = 0
    y = y_ind/(leave_one_out_gene_ranking.shape[0])

    for c in row_colors: 
        pos = (x/len(row_vals),y)
        ax1.add_patch(patches.Rectangle(pos,1,1 / (len(row_vals)), color=c,
                                        label=leave_one_out_gene_ranking .ix[y_ind].name))
    
        if x >= len(row_vals)-1:
            y += 1
            x = 0
        else:
            x += 1

# SAVE FIGURE
figure_label = '_{}_leave_one_out_rankedmean_wAGR2alone'.format(subset_type,meta) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
LTA4H
SOX9
AGR2
KLF2
MAP2K1
ASCL1
WNT7B
SPDEF
ERRFI1
MAP2K2
NUMB
EYA1
BMP4
AIMP2
THRA
GPSM2
RBPJ
FGF10
FOXJ1
NUMA1
GATA6
SHH
ADAMTSL2
FOXA1
STRA6
NFIB
THRB
YAP1
GRHL2
SAV1
NKX2-1
HOXA5
PKD1
CREB1
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_MET_leave_one_out_rankedmean_wAGR2alone

Individual tumor cells ranked and clustered according to lung epithelial development genes

In [88]:
# LOAD GENESETS
filename = 'stem_cell_signatures_merged'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')

subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))

subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])

# HEATMAP OF INDIVIDUAL CELLS RANKED BY LUNGE EP
rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]

QUERY['LUNG_EPITHELIUM_RANK'] =np.nanmean(vals,axis=1)
QUERY['PROLIFERATION'] = np.nanmean(QUERY[['MCM2','PCNA','MKI67']],axis=1)

plot_genes = ['SOX17','HHEX','KRT5','VIM','SNAI2','BMP4','DKK1','SOX2','FGFR2','FOXA2','NKX2-1','WNT7B','SOX9','TM4SF1']#

mat = pd.DataFrame( data = zscore(QUERY.sort_values(by=['LUNG_EPITHELIUM_RANK'])[plot_genes],axis=0), 
                   columns = plot_genes, index = QUERY.index)
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]    
for ind in np.arange(half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(4,10))

# ADD MATRIX WITH LINEAGE NAMES
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
axmatrix.spines['top'].set_visible(False)
axmatrix.spines['right'].set_visible(False)
axmatrix.spines['bottom'].set_visible(False)
axmatrix.spines['left'].set_visible(False)

# ROW1 COLORS
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
ind1 = mat.index.get_level_values(meta)
lut = {'I-P': '#F0F0F0', 'I-Q': '#E0E0E0', 'II': '#A0A0A0', 'III': '#202020'}
row_colors1 = pd.Series(ind1).map(lut)

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.08,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors1:
    pos = (x, y / len(row_colors1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors1), color=c))
    if y >= len(row_colors1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD DENDROGRAM
ax2 = fig.add_axes([0.00,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '{}_RankedCells_Development_All_tumor_Cells_labeled'.format(subset_type).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDEPITHELIAL_NOR_TUMOR_MET_RankedCells_Development_All_tumor_Cells_labeled

Query NK ligand expression across ranked tumor cells in states: I-Q/II/III

In [89]:
# LOAD GENESETS
filename = 'stem_cell_signatures_merged'
path_to_genesets = DATA_PATH+'{}.csv'.format(filename)
genesets = pd.read_csv(path_to_genesets,header='infer')

# SUBSET ALL TUMOR CELLS
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])

# HEATMAP OF INDIVIDUAL CELLS RANKED BY LUNGE EP
rank_by = 'GO_LUNG_EPITHELIUM_DEVELOPMENT'
genesets_include = [rank_by]
genes = np.unique(genesets[genesets_include].values.ravel().tolist())
genes = [gene for gene in genes if gene in list(set(QUERY.columns))]
vals = QUERY[genes]

QUERY['LUNG_EPITHELIUM_RANK'] = np.nanmean(QUERY[genes],axis=1)
QUERY['PROLIFERATION'] = np.nanmean(QUERY[['MCM2','PCNA','MKI67']],axis=1)
QUERY['NK_ACT'] = np.nanmean(QUERY[['ULBP1','ULBP2','RAET1E','RAET1G','RAET1L','MICA','MICB']],axis=1)
QUERY['NK_INH'] = np.nanmean(QUERY[['HLA-A','HLA-B','HLA-C','B2M']],axis=1)

mat = zscore(QUERY.sort_values(by=['LUNG_EPITHELIUM_RANK'])[genes],axis=0)

plot_genes = ['SOX2','SOX9','PROLIFERATION','NK_INH','NK_ACT',
              'HLA-A','HLA-B','HLA-C','B2M',
             'ULBP1','ULBP2','RAET1E','RAET1G','RAET1L','MICA','MICB']

mat = pd.DataFrame( data = zscore(QUERY.sort_values(by=['LUNG_EPITHELIUM_RANK'])[plot_genes],axis=0), 
                   columns = plot_genes, index = QUERY.index)
window = 20
mat = mat.rolling(window, win_type='triang',center = True).sum()
half_window = int(window/2)
for ind in np.arange(half_window):
    mat.iloc[ind] = mat.iloc[half_window]    
for ind in np.arange(half_window):
    mat.iloc[-ind] = mat.iloc[-half_window]

# HERE WE ARE ONLY VISUALIZING CELLS ASSIGNED TO DEVELOPMENTAL STATES I-Q, II, II FOR SIMPLER VISUALIZATION
# PROLIFERATING TYPE I CELLS (STEM LIKE), ASSOCIATE WITH REGENERATIVE TYPE II CELLS
# MOUSE LCC CELLS ASSOCIATE WITH I-Q STATE
subset_key = 'DEVELOPMENT_PHENOGRAPH_CLASS'
ix = [ind for ind, name in enumerate(mat.index.names) if name==subset_key][0]
mat = mat.select(lambda x: x[ix] in ['I-Q','II','III'])

# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(6,10))

# ADD MATRIX WITH LINEAGE NAMES
axmatrix = fig.add_axes([0.12,0.1,0.4,0.6])
colors = [(1,1,1), np.divide(tuple(hex('FF0000').rgb),255)]
im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=CM_DIVERGING,vmin=-10,vmax=10)
labels = list(mat.columns)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.set_yticklabels([]);
xtick = plt.xticks(range(len(labels)), labels, rotation = 90, fontsize = 10,fontname='Arial')
axmatrix.grid(False)
axmatrix.spines['top'].set_visible(False)
axmatrix.spines['right'].set_visible(False)
axmatrix.spines['bottom'].set_visible(False)
axmatrix.spines['left'].set_visible(False)

# ROW1 COLORS
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
ind1 = mat.index.get_level_values(meta)
lut = {'I-P': '#F0F0F0', 'I-Q': '#E0E0E0', 'II': '#A0A0A0', 'III': '#202020'}
row_colors1 = pd.Series(ind1).map(lut)

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.08,0.1,0.035,0.6]) # [x0,y0,width,height]
x = 0
y = 0
for c in row_colors1:
    pos = (x, y / len(row_colors1))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors1), color=c))
    if y >= len(row_colors1)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.00,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '{}_RankedCells_NKligands_All_tumor_Cells_labeled'.format(subset_type).replace(' ','_')
fn = FIG_output_stem  + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=400)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDEPITHELIAL_NOR_TUMOR_MET_RankedCells_NKligands_All_tumor_Cells_labeled

Mapping xenograft model of metastases to human metastatic states

Description

We profiled the transcriptomes of 14,821 single cells obtained from metastases that evolved from H2087-LCC cells xenograft in athymic mice with and without the selective pressure of NK cell surveillance. Observed metastatic states were mapped to the tumor lung epithelial development states observed in human patient metastasis (section 3) and their distributions were compared in the spontaneous vs. NK cell depleted macrometastases.

Specify read/write directories

In [90]:
h5_data_mouse = H5(DATA_PATH+'MOUSE_LUNG_ADENOCARCINOMA_METASTASIS_ANNOTATED.h5')
h5_data_mouse.ls()
['/DF_XENOGRAFT_METASTASES_NK', '/DF_XENOGRAFT_METASTASES_NK_DEPLETED', '/DIMENSIONS_XENOGRAFT_METASTASES_NK', '/DIMENSIONS_XENOGRAFT_METASTASES_NK_DEPLETED', '/GENE_RANK_MAST_XENOGRAFT_METASTASES_NK', '/GENE_RANK_MAST_XENOGRAFT_METASTASES_NK_DEPLETED', '/INDF_XENOGRAFT_METASTASES_NK', '/INDF_XENOGRAFT_METASTASES_NK_DEPLETED', '/NDF_XENOGRAFT_METASTASES_NK', '/NDF_XENOGRAFT_METASTASES_NK_DEPLETED']

Define colormaps

In [91]:
# --- DEFINE ALL COLORMAPS UPFRONT FOR UNIFORMITY ---
# CATEGORICAL: SAMPLE
FLATUI_SAMPLE_M = [
                 '994C00', #BLIneg
                 '663300', #BLIneg
                 
                 '9ACD32', #Early Lung
                 '6B8E23', #Early Lung
                 '006400', #Early Lung 
    
                 'FF9933', # Late Micro Lung Met   
                 'FF9933', # Late Micro Lung Met(b)
]

FLATUI_SOURCE_M = [
                 '006400', # Aggressive
                 '8B4513', # BLINEG
                 'FF9933', # Stable Micromet    
]

FLATUI_CLASS_M = [
                 '0000FF', 
                 'FF0000', 
                 'FF8C00', 
                 'FFD700', 
                 '808000', 
                 '9ACD32', 
                 '006400', 
                 '20B2AA', 
                 '2F4F4F', 
                 '00BFFF', 
                 '191970', 
                 '4B0082', 
                 '8B008B', 
                 'FF00FF', 
                 'FF1493', 
                 '8B4513', 
                 'B0C4DE',  
                 '696969', 
                 'FFB6C1', 
                 'FAEBD7', 
                 'F5DEB3', 
                 '00CCCC', 
                 'BC8F8F', 
]

# CONVERT HEX TO RGB (SAMPLES)
colors = np.zeros((len(FLATUI_SAMPLE_M),3))
for ind,hexcolor in enumerate(FLATUI_SAMPLE_M):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SAMPLES_M = LinearSegmentedColormap.from_list('cm_sample', colors, N=len(colors))
                                              
# CONVERT HEX TO RGB (SOURCE)
colors = np.zeros((len(FLATUI_SOURCE_M),3))
for ind,hexcolor in enumerate(FLATUI_SOURCE_M):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_SOURCE_M = LinearSegmentedColormap.from_list('cm_source', colors, N=len(colors))

# CONVERT HEX TO RGB (CLASS)
colors = np.zeros((len(FLATUI_CLASS_M),3))
for ind,hexcolor in enumerate(FLATUI_CLASS_M):
    colors[ind,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
CM_CLASS_M = LinearSegmentedColormap.from_list('cm_class', colors, N=len(colors))

Metastases that evolved in the presence of surveilling NK cells

Description

We profiled the transcriptomes of 8,748 single cells obtained from 6 spontaneous metastases isolated from 3 bioluminescence (BLI)-negative organs and 3 BLI-positive macrometastases after intracardiac injection of H2087-LCC latent lung adenocaromina cells in athymic mice. One of the micrometastases isolated from a in vivo BLI-negative organ, showed signal upon ex vivo imaging and therefore was annotated as an incipient lesion (Fig. 4a). These samples were isolated by enzymatic and mechanical digestion, and selected for one passage on a plate in antibiotic to which tumor cell were engineered to be resistant. In vitro selection was required to enrich for the exceedingly rare DTCs from BLI negative organs. By immunofluorescence, we observe these persist as single cells or micrometastatic clusters < 10 cells. They show little to weak organ tropsim, therefore, one passage of selection and screening of multiple organ sites is required to find this rare, but important subpopulation of metastatic cells. Given the selection in antibiotic, only tumor cells were sequenced in this experiment. scRNA-seq data from these selected micro- to macro-metastases were merged to create a global atlas of all stages of lung adenocarcinoma progression observed in this xenograft model of metastasis. In this section, we walk through mapping of these mouse metastatic states to the tumor lung epithelial development states observed in human patient metastasis (section 3). "

Load data

In [92]:
subset_type = 'XENOGRAFT_METASTASES_NK'
In [93]:
# LOAD DATA

# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data_mouse.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data_mouse.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data_mouse.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data_mouse.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data_mouse.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize meta cell type annotations and key genes

In [94]:
# SPECIFY DIMENSION TYPE
dimtype = 'ForceDirected'

cm = CM_DIVERGING

# PLOT SAMPLE_ID
plt.figure(figsize = (10,10))
gs1 = gridspec.GridSpec(2, 2)
gs1.update(wspace=0.2, hspace=0.2) # set the spacing between axes. 
dot_size = 3

exec('QUERY = INDF_{}'.format(subset_type))
exec('X = DIMENSIONS_{}[\'{}0\']'.format(subset_type,dimtype))
exec('Y = DIMENSIONS_{}[\'{}1\']'.format(subset_type,dimtype))
xmax = np.abs(X).max()
ymax = np.abs(Y).max()
axis_min = -ceil(ceil(max(xmax,ymax))/10)*10
axis_max = ceil(ceil(max(xmax,ymax))/10)*10

# (01) PLOT PHNENGORAPH CLUSTERS
ax = plt.subplot(gs1[0])
meta = 'PHENOGRAPH_CLASS'
seqc.plot.scatter.categorical(X, Y, c= QUERY.index.get_level_values(meta), randomize = True,
                              cmap=CM_CLASS_M,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
plt.title('Type', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax.legend_.remove()
plt.axis('off')

# (02) PLOT SAMPLE_ID
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(X, Y, c=QUERY.index.get_level_values('Legend'), randomize = True, 
                              cmap=CM_SAMPLES_M, s=dot_size, ax=ax)
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.legend('')

# (03) PLOT SOX2 EXPRESSION
ax = plt.subplot(gs1[2])
phase = zscore(QUERY['SOX2'])
    
# Plot scatter map with phase score
mask = np.isnan(phase)
    
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
    
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6)             # finite data
plt.title('SOX2', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.legend('')

# (03) PLOT SOX9 EXPRESSION
ax = plt.subplot(gs1[3])
phase = zscore(QUERY['SOX9'])
    
# Plot scatter map with phase score
mask = np.isnan(phase)
    
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
    
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6)             # finite data
plt.title('SOX9', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)  
plt.legend('')

# SAVE FIGURE
figure_label = '_SOX2_SOX9_CLUSTER_{}_{}'.format(dimtype,subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_SOX2_SOX9_CLUSTER_ForceDirected_XENOGRAFT_METASTASES_NK

Correlation with human tumor lung epithelial development states

Select shared, variable genes across datasets

In [95]:
subset_type = 'XENOGRAFT_METASTASES_NK'
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean())

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
mouse_goi = list(retain_bool[retain_bool].index)
print(len(mouse_goi))

# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
6636
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_XENOGRAFT_METASTASES_NK_GENE_FILTERING
In [96]:
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency')
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)')
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)')
plt.ylabel('variance log10(expression)')

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
human_goi = list(retain_bool[retain_bool].index)
print(len(human_goi))

# SAVE FIGURE
figure_label = '_{}_GENE_FILTERING'.format(subset_type) 
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label 
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
6030
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_EPITHELIAL_NOR_TUMOR_MET_GENE_FILTERING
In [97]:
# SELECT INTERSECTING, VARIABLE GENES
expressed = list(set(mouse_goi).intersection(human_goi))
print(len(expressed))
2096

Compute aggregate values and center

In [98]:
# HUMAN DATA

# SPECIFY SUBSET
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))

subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])

# Load Data (genes x cells)
patient_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns, 
                               columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta
# Median (replace by Geometric Mean)
patient_lung_ep = patient_lung_ep.groupby(level=0, axis=1).median()

# Transpose the data (clusters x genes)
patient_lung_ep = patient_lung_ep.T

# center features across all clusters per gene
patient_lung_ep -= patient_lung_ep.mean(axis=0)

print(patient_lung_ep.shape)
patient_lung_ep.head()
(4, 2096)
Out[98]:
CD151 MTRNR2L1 RBM4 ASRGL1 BCL7A SAMHD1 CPEB4 APOBEC3A GRAP2 SLC17A9 ... PLIN5 C10ORF11 FAT1 TRIM7 CITED1 AC133644.2 SGK1 BAG3 SH3PXD2B PAPSS2
DEVELOPMENT_PHENOGRAPH_CLASS
I-P 0.452273 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.248911 0.0 0.0
I-Q 0.892120 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.070827 0.0 0.0
II -0.475705 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.248911 0.0 0.0
III -0.868688 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.426994 0.0 0.0

4 rows × 2096 columns

In [99]:
# MOUSE DATA

# SPECIFY SUBSET
subset_type = 'XENOGRAFT_METASTASES_NK'
meta = 'PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))

# Load Data (genes x cells)
mouse_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns, 
                             columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta

# Assign aggregate value
mouse_lung_ep = mouse_lung_ep.groupby(level=0, axis=1).median()

# Transpose the data (clusters x genes)
mouse_lung_ep = mouse_lung_ep.T
print(mouse_lung_ep.shape)

# ROW COLOR LUT
# CONVERT HEX TO RGB (FLATUI_CLASS)
FLATUI_PLOT = FLATUI_CLASS_M
ind = list(mouse_lung_ep.index)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(sorted(np.unique(ind)), colors[cix,:]))

# center features across all clusters per gene
mouse_lung_ep -= mouse_lung_ep.mean(axis=0)

print(mouse_lung_ep.shape)
mouse_lung_ep.head()
(18, 2096)
(18, 2096)
Out[99]:
CD151 MTRNR2L1 RBM4 ASRGL1 BCL7A SAMHD1 CPEB4 APOBEC3A GRAP2 SLC17A9 ... PLIN5 C10ORF11 FAT1 TRIM7 CITED1 AC133644.2 SGK1 BAG3 SH3PXD2B PAPSS2
PHENOGRAPH_CLASS
0 -0.453985 2.749966 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.018515 0.0 0.0
1 -0.073313 -0.437948 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.018515 0.0 0.0
2 0.099004 -0.959605 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.018515 0.0 0.0
3 -0.453985 3.667647 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.018515 0.0 0.0
4 -0.453985 1.356363 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.018515 0.0 0.0

5 rows × 2096 columns

Compute genome-wide correlation between aggregate and centered values

In [100]:
rvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
pvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
stderrs = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])

for mouse_ind in mouse_lung_ep.index:
    for human_ind,human_ind_name in enumerate(patient_lung_ep.index):
        m = mouse_lung_ep.ix[mouse_ind]
        h = patient_lung_ep.ix[human_ind]
        # Linear Regression
        lr = linregress(m, h)
        slope = lr[0]
        intercept = lr[1]
        rvalue = lr[2]
        pvalue = lr[3]
        stderr = lr[4]
        rvalues[mouse_ind,human_ind] = rvalue
        pvalues[mouse_ind,human_ind] = pvalue
        stderrs[mouse_ind,human_ind] = stderr

# Generate Dataframe
rvalues = pd.DataFrame(
    data=rvalues,
    columns=list(patient_lung_ep.index),
    index=mouse_lung_ep.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'

pvalues = pd.DataFrame(
    data=pvalues,
    columns=list(patient_lung_ep.index),
    index=mouse_lung_ep.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'

stderrs = pd.DataFrame(
    data=stderrs,
    columns=list(patient_lung_ep.index),
    index=mouse_lung_ep.index)
stderrs.columns.name = 'Patient Single Cell Lung Epithelium'
In [101]:
# CLUSTERED HEATMAP OF CORRELATIONS
corrs = rvalues
pvals = pvalues
stds = stderrs

# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average' # single
metric = 'euclidean' # correlation
linkage = hc.linkage(corrs, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(corrs.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = ['I-Q','I-P','II','III']#hc.leaves_list(col_linkage)
mat = corrs.ix[r1,cl]
pvals = pvals.ix[r1,cl]

# REMOVE CLUSTERS NOT SIGNIFICANTLY CORRELATED
keep_rows = np.abs(mat).sum(axis=1)>0
mat = mat.ix[list(keep_rows[keep_rows==True].index)]
pvals = pvals.ix[list(keep_rows[keep_rows==True].index)]

# ROW COLORS
ind = list(mat.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
In [102]:
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(0.9,4))

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.12,0.1,0.20,0.6]) # [x0,y0,width,height]
x = 0
y = 0

for c in row_colors:
    pos = (x, y / len(row_colors))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors), color=c))
    if y >= len(row_colors)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.2,0.1,0.7,0.6])

im = axmatrix.matshow(mat[(pvals<0.05)], aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)

# ADD COLORBAR
axcolor = fig.add_axes([1.2,0.1,0.05,0.6])
cbar = plt.colorbar(im, cax=axcolor)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.038,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '_PatientLungEp_NDF_{}_CORRELATIONS_AVERAGE_{}_labeled'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_PatientLungEp_NDF_XENOGRAFT_METASTASES_NK_CORRELATIONS_AVERAGE_PHENOGRAPH_CLASS_labeled

Bi-partite representation of mapping between human and mouse metastatic states

In [103]:
import networkx as nx
import matplotlib.pyplot as plt
sns.set_style("white")

B = nx.Graph()
B.add_nodes_from(corrs.index, bipartite=0)
B.add_nodes_from(corrs.columns, bipartite=1)
list(B)

B.add_weighted_edges_from(
    [(idx, 'I-Q',row['I-Q']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
    [(idx, 'I-P',row['I-P']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
    [(idx, 'II',row['II']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
    [(idx, 'III',row['III']+1) for idx, row in corrs.iterrows()],weight='weight')

cutoff = 1.20 
SUB_B=nx.Graph( [ (u,v,d) for u,v,d in B.edges(data=True) if d['weight']>cutoff] )
pos = nx.spring_layout(SUB_B)
list(SUB_B)

color_map = []
edge_color_map = []
size_array = []
for n in SUB_B: 
    if n=='I-P':
        color_map.append('#F0F0F0')
        edge_color_map.append('black')
        size_array.append(2000)
    elif n=='I-Q':
        color_map.append('#E0E0E0')
        edge_color_map.append('black')
        size_array.append(2000)
    elif n=='II':
        color_map.append('#A0A0A0')
        edge_color_map.append('black')
        size_array.append(2000)
    elif n=='III':
        color_map.append('#202020')
        edge_color_map.append('black')
        size_array.append(2000)
    else:
        color_map.append(lut[n])
        edge_color_map.append('white')
        size_array.append(1000)


plt.figure(figsize = (10,10))
ax = plt.gca()

nodes = nx.draw_networkx_labels(SUB_B, pos,with_labels=True,node_size = size_array,
        node_color = color_map,fontsize=3,fontcolor = 'white')
nodes = nx.draw_networkx_nodes(SUB_B, pos,with_labels=True,node_size = size_array,
        node_color = color_map)
nodes.set_edgecolor(edge_color_map)

#4 c. Plot the edges - one by one!
all_weights = []
for (node1,node2,data) in SUB_B.edges(data=True):
    all_weights.append(data['weight']) #we'll use this when determining edge thickness

#4 b. Get unique weights
unique_weights = list(set(all_weights))        
for weight in unique_weights:
    weighted_edges = [(node1,node2) for (node1,node2,edge_attr) in SUB_B.edges(data=True) if edge_attr['weight']==weight]
    width = weight*len(SUB_B)/sum(all_weights)*3

    nx.draw_networkx_edges(SUB_B,pos,edgelist=weighted_edges,width=width) 


# SAVE FIGURE
FIG_output_stem = './figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_'+now+'/'
figure_label = 'BipartiteGraph-Mouse-Human-Correlation-Labeled'
fn = FIG_output_stem + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/BipartiteGraph-Mouse-Human-Correlation-Labeled

Cells were assigned to quiescent (I-Q), regenerating (I-P/II), and SOX9 high escape (III) as visualized in the bipartite graph above. Clusters that did not signficiantly associate with any of these developmental states (not shown in bipartite) were annotated as 'NA'. All annotations are saved in the multi-index of each dataframe under index name: DEVELOPMENTAL_STATE.

Fraction of meta-source per Phenograph cluster

In [104]:
subset_type = 'XENOGRAFT_METASTASES_NK'
meta = 'Meta-Source'
exec('fractions = DF_{}.groupby(level=[\'PHENOGRAPH_CLASS\', \'{}\'], axis=0).size().unstack().fillna(0)'.format(subset_type,meta))
In [105]:
tissue_cluster_sizes = fractions.iloc[:,:3]
colors = [
                 '#006400', # AGGRESSIVE
                 '#8B4513', # BLINEG
                 '#FF9933', # STABLE MICROMET
]


# Plot parameters
nrow = int(ceil(sqrt(len(tissue_cluster_sizes.index))))
ncol = int(ceil(sqrt(len(tissue_cluster_sizes.index))))
plt.figure(figsize = (10,10))
gs1 = gridspec.GridSpec(nrow, ncol)
gs1.update(wspace=0.25, hspace=0.1) # set the spacing between axes. 
sns.set(font_scale=2.5) 

for ind in np.arange(tissue_cluster_sizes.shape[0]):
    
    ax = plt.subplot(gs1[ind])
    
    plt.pie(
        tissue_cluster_sizes.iloc[ind,:],
        shadow=False,
        colors=colors,
        )
    
    # View the plot drop above
    plt.axis('equal')
    plt.xlabel(tissue_cluster_sizes.index[ind], fontsize = 10)

# SAVE FIGURE
FIG_output_stem = './figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_'+now+'/'
figure_label = 'BipartiteGraph-Mouse-Human-Correlation-PiePlots-Filled'
fn = FIG_output_stem + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/BipartiteGraph-Mouse-Human-Correlation-PiePlots-Filled

NK ligand expression evalauted per developmental state

In [106]:
# PLOT SAMPLE_ID
subset_type = 'XENOGRAFT_METASTASES_NK'
plt.figure(figsize = (12,5))
gs1 = gridspec.GridSpec(1,4)
gs1.update(wspace=0.35, hspace=0.2) # set the spacing between axes. 

datatype = 'INDF_{}'.format(subset_type)
exec('QUERY = {}'.format(datatype))
sns.set_style("white")
subset_key = 'PHENOGRAPH_CLASS'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]

# PHENOGRAPH CLUSTERS THAT SIGNFICANTLY MAPPED TO HUMAN LUNG EP DEVELOPMENT STATE
QUERY = QUERY.select(lambda x: x[ix] in [3,4,10,0,1,5,8,9,16,6,15,12,2,17,13,11,7])

# ANNOATIONS FROM BI-PARTITE GRAPH
development_class_lut = {3:'Escape',
                         4:'Escape',
                         10:'Escape',
                         0:'Escape',
                         1:'Regenerating',
                         5:'Regenerating',
                         8:'Regenerating',
                         9:'Regenerating',
                         16: 'Quiescent',
                         6: 'Quiescent',
                         15: 'Quiescent',
                         12: 'Quiescent',
                         2: 'Quiescent',
                         17: 'Quiescent',
                         13: 'Quiescent',
                         11: 'Quiescent',
                         7: 'Quiescent' }

ind = QUERY.index.get_level_values(subset_key).map(development_class_lut)
palette = {'Quiescent': '#E0E0E0', 'Regenerating': '#A0A0A0', 'Escape': '#202020'}
order = ['Quiescent','Regenerating','Escape']
##############################################################################################################
ax = plt.subplot(gs1[0])
title= 'SOX2'
genes = [title]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)

# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        'Development_Class':ind[ii]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True, 
                    hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)

# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values

print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))

print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))

print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
ax = plt.subplot(gs1[1])
title= 'SOX9'
genes = [title]
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)

# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        'Development_Class':ind[ii]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True, 
                    hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)

# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values

print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))

print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))

print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
ax = plt.subplot(gs1[2])
title = 'NK_ACTIVATING'
genes = ['ULBP1','ULBP2','RAET1E','RAET1G','RAET1L','MICA','MICB']
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)

# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        'Development_Class':ind[ii]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True, 
                    hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)

# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values

print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))

print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))

print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################
ax = plt.subplot(gs1[3])
title = 'NK_INHIBITORY'
genes = ['HLA-A','HLA-B','HLA-C','B2M']
genes = np.unique([x for x in genes if str(x) != 'NAN'])
detected_genes = list(set(genes).intersection(set(QUERY.columns)))
vals = QUERY[detected_genes]
SCORE = np.nanmean(vals,axis=1)

# Format data structure for violin plot
violin_data = []
for ii,v in enumerate(SCORE):
    violin_data.append({'gene': title, 'Z-normalized Expression': v,
                        'Development_Class':ind[ii]}) 
violin_data = pd.DataFrame(violin_data)  

# BOXPLOT GENE EXPRESSION
g = sns.boxplot(x="gene", y="Z-normalized Expression", hue='Development_Class',data=violin_data, palette=palette,notch = True, 
                    hue_order = order,fliersize = 4, showmeans=False,linewidth = 1, ax = ax) #order = labels, 
g.set_ylabel("{}".format(datatype),fontsize=10)
g.set_xlabel(" ",fontsize=10,rotation = 90)
g.tick_params(labelsize=10)
sns.despine()
ax.set(ylim=(0, SCORE.max()*0.99))
ax.legend().set_visible(False)

# COMPARE DISTRIBUTIONS
QUIESCENT = violin_data.loc[violin_data['Development_Class'].isin(['Quiescent'])]['Z-normalized Expression'].values
REGENERATING = violin_data.loc[violin_data['Development_Class'].isin(['Regenerating'])]['Z-normalized Expression'].values
ESCAPE = violin_data.loc[violin_data['Development_Class'].isin(['Escape'])]['Z-normalized Expression'].values

print(title + ' {} vs. {}'.format('Quiescent','Regenerating'))
print(stats.mannwhitneyu(QUIESCENT,REGENERATING))

print(title + ' {} vs. {}'.format('Quiescent','Escape'))
print(stats.mannwhitneyu(QUIESCENT,ESCAPE))

print(title + ' {} vs. {}'.format('Regenerating','Escape'))
print(stats.mannwhitneyu(REGENERATING,ESCAPE))
##############################################################################################################

# SAVE FIGURE
figure_label = '_boxplots_NKligands_{}_developmentclass'.format(subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
SOX2 Quiescent vs. Regenerating
MannwhitneyuResult(statistic=2682177.0, pvalue=1.754993509567453e-87)
SOX2 Quiescent vs. Escape
MannwhitneyuResult(statistic=1111167.0, pvalue=0.0)
SOX2 Regenerating vs. Escape
MannwhitneyuResult(statistic=401885.0, pvalue=0.0)
SOX9 Quiescent vs. Regenerating
MannwhitneyuResult(statistic=3766326.0, pvalue=0.019671422785504804)
SOX9 Quiescent vs. Escape
MannwhitneyuResult(statistic=415792.0, pvalue=0.0)
SOX9 Regenerating vs. Escape
MannwhitneyuResult(statistic=470960.0, pvalue=0.0)
NK_ACTIVATING Quiescent vs. Regenerating
MannwhitneyuResult(statistic=2422711.0, pvalue=5.956534930084386e-128)
NK_ACTIVATING Quiescent vs. Escape
MannwhitneyuResult(statistic=3602815.0, pvalue=7.136278536863184e-69)
NK_ACTIVATING Regenerating vs. Escape
MannwhitneyuResult(statistic=2439823.0, pvalue=6.053718985150174e-23)
NK_INHIBITORY Quiescent vs. Regenerating
MannwhitneyuResult(statistic=3629503.0, pvalue=8.605442054086639e-06)
NK_INHIBITORY Quiescent vs. Escape
MannwhitneyuResult(statistic=1099207.0, pvalue=0.0)
NK_INHIBITORY Regenerating vs. Escape
MannwhitneyuResult(statistic=388211.0, pvalue=0.0)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_boxplots_NKligands_XENOGRAFT_METASTASES_NK_developmentclass

Metastases that evolved with NK cell depletion

Description

We profiled the transcriptomes of 6,073 single cells obtained from 8 H2087-LCC macro-metastases, whose outgrowth was triggered by NK cell depletion (IP administration of anti-GM1). Tumor cells were freshly isolated from enzymatically and mechanically dissocaited macrometastases by flow sorting of GFP+ cells. Macrometastatic growths were identified and tracked by longitudinal bioluminescence imaging. scRNA-seq data from these freshly isoalted macrometastatic outgrowths were merged to create a global atlas of cell states observed in macrometastases that evolved without the selective pressure of NK cell surveillance. In this section, we walk through mapping of these NK cell-depleted mouse metastatic states to the tumor lung epithelial development states observed in human patient metastasis (section 3).

Load data

In [107]:
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
In [108]:
# LOAD DATA

# RAW COUNTS (Cells x Genes)
exec('DF_{} = h5_data_mouse.load(\'/DF_{}\')'.format(subset_type,subset_type))

# MEDIAN LIBRARY-SIZE NORMALIZED COUNTS (Cells x Genes)
exec('NDF_{} = h5_data_mouse.load(\'/NDF_{}\')'.format(subset_type,subset_type))

# MAGIC IMPUTED AND NORMALIZED DATA (Cells x Genes)
exec('INDF_{} = h5_data_mouse.load(\'/INDF_{}\')'.format(subset_type,subset_type))

# CELLS X DIMENSIONS (including principal components, diffusion components, ect.)
exec('DIMENSIONS_{} = h5_data_mouse.load(\'/DIMENSIONS_{}\')'.format(subset_type,subset_type))

# RANKED DIFFERENTIALLY EXPRESSED GENES BY MAST (EACH CELL TYPE VS. ALL OTHER CELLS)
exec('GENE_RANK_MAST_{} = h5_data_mouse.load(\'/GENE_RANK_MAST_{}\')'.format(subset_type,subset_type))

Visualize meta cell type annotations and key genes

In [109]:
# SPECIFY DIMENSION TYPE
dimtype = 'ForceDirected'

cm = CM_DIVERGING

# PLOT SAMPLE_ID
plt.figure(figsize = (10,10))
gs1 = gridspec.GridSpec(2, 2)
gs1.update(wspace=0.2, hspace=0.2) # set the spacing between axes. 
dot_size = 3

exec('QUERY = INDF_{}'.format(subset_type))
exec('X = DIMENSIONS_{}[\'{}0\']'.format(subset_type,dimtype))
exec('Y = DIMENSIONS_{}[\'{}1\']'.format(subset_type,dimtype))
xmax = np.abs(X).max()
ymax = np.abs(Y).max()
axis_min = -ceil(ceil(max(xmax,ymax))/10)*10
axis_max = ceil(ceil(max(xmax,ymax))/10)*10

# (01) PLOT PHNENGORAPH CLUSTERS
ax = plt.subplot(gs1[0])
meta = 'PHENOGRAPH_CLASS'
seqc.plot.scatter.categorical(X, Y, c= QUERY.index.get_level_values(meta), randomize = True,
                              cmap=CM_CLASS_M,legend_kwargs={'ncol': 1}, s=dot_size, ax=ax);
plt.title('Type', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
sns.despine()
ax.legend_.remove()
plt.axis('off')

# (02) PLOT SAMPLE_ID
ax = plt.subplot(gs1[1])
seqc.plot.scatter.categorical(X, Y, c=QUERY.index.get_level_values('Legend'), randomize = True, 
                              cmap=CM_CLASS_M, s=dot_size, ax=ax)
plt.title('Sample ID', fontname='Helvetica', size=12, weight='normal')
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.axis('off')
plt.legend('')

# (03) PLOT SOX2 EXPRESSION
ax = plt.subplot(gs1[2])
phase = zscore(QUERY['SOX2'])
    
# Plot scatter map with phase score
mask = np.isnan(phase)
    
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
    
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6)             # finite data
plt.title('SOX2', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)
plt.legend('')

# (03) PLOT SOX9 EXPRESSION
ax = plt.subplot(gs1[3])
phase = zscore(QUERY['SOX9'])
    
# Plot scatter map with phase score
mask = np.isnan(phase)
    
# Randomize/Sort Color Values before Plotting
color_values = np.array(phase)
i = np.random.permutation(len(color_values))
x = np.array(X)[i]
y = np.array(Y)[i]
color_values = color_values[i]
    
plt.scatter(x[mask], y[mask], s=dot_size, facecolors='none',edgecolors='black') # nans still shown empty
plt.scatter(x, y, c=color_values,cmap=cm, s=dot_size, alpha =0.6)             # finite data
plt.title('SOX9', fontname='Helvetica', size=12, weight='normal')
plt.axis('off')
plt.clim(-1.5,1.5)
plt.xlim(axis_min,axis_max)
plt.ylim(axis_min,axis_max)  
plt.legend('')

# SAVE FIGURE
figure_label = '_SOX2_SOX9_CLUSTER_{}_{}'.format(dimtype,subset_type)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', bbox_inches='tight',dpi=fig_dpi)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_SOX2_SOX9_CLUSTER_ForceDirected_XENOGRAFT_METASTASES_NK_DEPLETED

Correlation with human tumor lung epithelial development states

Select shared, variable genes across datasets

In [110]:
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
exec('QUERY = NDF_{}'.format(subset_type))

matplotlib.rc('xtick', labelsize=12) 
matplotlib.rc('ytick', labelsize=12)

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)', fontsize=12)
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean())

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)', fontsize=12)
plt.ylabel('variance log10(expression)', fontsize=12)

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
mouse_goi = list(retain_bool[retain_bool].index)
print(len(mouse_goi))
7852
In [111]:
#### IDENTIFY VARIABLY EXPRESSED GENES DETECTED IN > 10 CELL (SINGLE CELL UN-IMPUTED DATA) ####
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
exec('QUERY = NDF_{}'.format(subset_type))

# FIT BINOMIAL DISTRIBUTION AND FILTER BASED ON MEAN/STD OF SECOND
plt.figure(figsize = (10,2))
gs1 = gridspec.GridSpec(1, 2)
gs1.update(wspace=0.25, hspace=0.7) # set the spacing between axes. 

# (1) PLOT LOG NUMBER OF CELLS CONTRIBUTING TO EACH GENE
num_cells_per_gene = np.log(np.sum(QUERY.values > 0,axis=0))
num_cells_per_gene[(np.isinf(num_cells_per_gene)) | (np.isnan(num_cells_per_gene))] = 0
keep_genes1 = pd.Series(data=num_cells_per_gene>=1,index = QUERY.columns) # GENES MUST BE DETECTED IN AT LEAST 10 CELLS

ax = plt.subplot(gs1[0])
bins = np.linspace(num_cells_per_gene.min(), num_cells_per_gene.max()*0.95, 20)
plt.hist(num_cells_per_gene, bins, alpha=0.5, label='keep')
plt.hist(num_cells_per_gene[keep_genes1], bins, alpha=1, label='retain',color = 'r')
    
ax.set_facecolor('white')
plt.xticks(rotation=70)
plt.ylabel('Frequency', fontsize=12)
plt.xlabel('Gene Filter 1: Remove Genes Singletons \n(Log. # Expressing Cells)', fontsize=12)
plt.grid(True)
sns.despine()

# (2) PLOT GENE DISPERSION
pseudocount = 0.0001
QUERY = np.log10(QUERY+pseudocount)

x = QUERY.mean(axis=0, skipna=False)
y = QUERY.var(axis=0, skipna=False)
z = np.polyfit(x, y, 6)
p = np.poly1d(z)
yfit = p(x)

# The biological component for each gene is defined as the difference between its total variance and the fitted value of the trend
biological_variance = y-yfit
keep_genes2 = biological_variance>(biological_variance.mean()+0*biological_variance.std())

ax = plt.subplot(gs1[1])
plt.scatter(x,y, c = 'g',s=1)
plt.scatter(x,yfit,s = 1,c = 'k')
plt.scatter(x[keep_genes2],y[keep_genes2],c='r',s=1)
plt.xlabel('mean log10(expression)', fontsize=12)
plt.ylabel('variance log10(expression)', fontsize=12)

# SELECT VARIABLE GENES EXPRESSED IN > 10 CELLS
retain_bool = keep_genes1*keep_genes2==True
human_goi = list(retain_bool[retain_bool].index)
print(len(human_goi))
6030
In [112]:
# SELECT INTERSECTING, VARIABLE GENES
expressed = list(set(mouse_goi).intersection(human_goi))
print(len(expressed))
2895

Compute aggregate values and center

In [113]:
# HUMAN DATA

# SPECIFY SUBSET
subset_type = 'EPITHELIAL_NOR_TUMOR_MET'
meta = 'DEVELOPMENT_PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))

subset_key = 'Meta-Source'
ix = [ind for ind, name in enumerate(QUERY.index.names) if name==subset_key][0]
QUERY = QUERY.select(lambda x: x[ix] in ['TUMOR','MET'])

# Load Data (genes x cells)
patient_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns, 
                               columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta
# Median (replace by Geometric Mean)
patient_lung_ep = patient_lung_ep.groupby(level=0, axis=1).median()

# Transpose the data (clusters x genes)
patient_lung_ep = patient_lung_ep.T

# center features across all clusters per gene
patient_lung_ep -= patient_lung_ep.mean(axis=0)

print(patient_lung_ep.shape)
patient_lung_ep.head()
(4, 2895)
Out[113]:
RBM4 MTRNR2L1 ASRGL1 BCL7A MT1E RP11-368I7.4 CRIP2 SAMHD1 RP6-91H8.3 TOX3 ... SGK1 C4ORF47 TSPAN8 SH3PXD2B TMEM169 HN1 RP11-1275H24.2 CHST4 PAPSS2 MET
DEVELOPMENT_PHENOGRAPH_CLASS
I-P 0.0 0.0 0.0 0.0 -0.946877 0.0 -0.019065 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -0.196332 0.0 0.0 0.0 -1.704586
I-Q 0.0 0.0 0.0 0.0 10.868391 0.0 0.470118 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.175309 0.0 0.0 0.0 5.113758
II 0.0 0.0 0.0 0.0 -5.329205 0.0 -0.828227 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.358575 0.0 0.0 0.0 -1.704586
III 0.0 0.0 0.0 0.0 -4.592309 0.0 0.377173 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -0.337553 0.0 0.0 0.0 -1.704586

4 rows × 2895 columns

In [114]:
# MOUSE DATA

# SPECIFY SUBSET
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
meta = 'PHENOGRAPH_CLASS'
exec('QUERY = NDF_{}[expressed]'.format(subset_type))

# Load Data (genes x cells)
mouse_lung_ep = pd.DataFrame(data = QUERY.T.values,index = QUERY.columns, 
                             columns = QUERY.index.get_level_values(meta)) # genes by cells labeled by meta

# Assign aggregate value
mouse_lung_ep = mouse_lung_ep.groupby(level=0, axis=1).median()

# Transpose the data (clusters x genes)
mouse_lung_ep = mouse_lung_ep.T
print(mouse_lung_ep.shape)

# ROW COLOR LUT
# CONVERT HEX TO RGB (FLATUI_CLASS)
FLATUI_PLOT = FLATUI_CLASS_M
ind = list(mouse_lung_ep.index)
colors = np.zeros((len(FLATUI_PLOT),3))
for ii,hexcolor in enumerate(FLATUI_PLOT):
    colors[ii,:] = tuple(hex(hexcolor).rgb)
colors = np.divide(colors,255)
cix = (np.linspace(0,shape(colors)[0],len(np.unique(ind)))).astype(int)
if cix[len(cix)-1]==shape(colors)[0]:
    cix[len(cix)-1]=shape(colors)[0]-1
lut = dict(zip(sorted(np.unique(ind)), colors[cix,:]))

# center features across all clusters per gene
mouse_lung_ep -= mouse_lung_ep.mean(axis=0)

print(mouse_lung_ep.shape)
mouse_lung_ep.head()
(18, 2895)
(18, 2895)
Out[114]:
RBM4 MTRNR2L1 ASRGL1 BCL7A MT1E RP11-368I7.4 CRIP2 SAMHD1 RP6-91H8.3 TOX3 ... SGK1 C4ORF47 TSPAN8 SH3PXD2B TMEM169 HN1 RP11-1275H24.2 CHST4 PAPSS2 MET
PHENOGRAPH_CLASS
0 0.0 0.869775 0.0 0.0 -0.119185 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -0.158192 0.0 0.0 0.0 0.0
1 0.0 -1.071085 0.0 0.0 -0.541034 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.162584 0.0 0.0 0.0 0.0
2 0.0 -1.819616 0.0 0.0 0.414009 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.014010 0.0 0.0 0.0 0.0
3 0.0 -1.359861 0.0 0.0 0.211456 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.201907 0.0 0.0 0.0 0.0
4 0.0 -1.074404 0.0 0.0 0.828978 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.433242 0.0 0.0 0.0 0.0

5 rows × 2895 columns

Compute genome-wide correlation between agreggate and centered values

In [115]:
rvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
pvalues = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])
stderrs = np.zeros([mouse_lung_ep.shape[0], patient_lung_ep.shape[0]])

for mouse_ind in mouse_lung_ep.index:
    for human_ind,human_ind_name in enumerate(patient_lung_ep.index):
        m = mouse_lung_ep.ix[mouse_ind]
        h = patient_lung_ep.ix[human_ind]
        # Linear Regression
        lr = linregress(m, h)
        slope = lr[0]
        intercept = lr[1]
        rvalue = lr[2]
        pvalue = lr[3]
        stderr = lr[4]
        rvalues[mouse_ind,human_ind] = rvalue
        pvalues[mouse_ind,human_ind] = pvalue
        stderrs[mouse_ind,human_ind] = stderr

# Generate Dataframe
rvalues = pd.DataFrame(
    data=rvalues,
    columns=list(patient_lung_ep.index),
    index=mouse_lung_ep.index)
rvalues.columns.name = 'Patient Single Cell Lung Epithelium'

pvalues = pd.DataFrame(
    data=pvalues,
    columns=list(patient_lung_ep.index),
    index=mouse_lung_ep.index)
pvalues.columns.name = 'Patient Single Cell Lung Epithelium'

stderrs = pd.DataFrame(
    data=stderrs,
    columns=list(patient_lung_ep.index),
    index=mouse_lung_ep.index)
stderrs.columns.name = 'Patient Single Cell Lung Epithelium'
In [116]:
# CLUSTERED HEATMAP OF CORRELATIONS
corrs = rvalues
pvals = pvalues
stds = stderrs

# PLOT CORRELATION OF CLUSTERS WITH BULK RNA Sequencing
method = 'average' # single
metric = 'euclidean' # correlation
linkage = hc.linkage(corrs, method=method, metric = metric)
row_linkage = deepcopy(linkage)
linkage = hc.linkage(corrs.T, method=method, metric = metric)
col_linkage = deepcopy(linkage)

# REORDER HEATMAP ACCORDING TO LINKAGE (OPTIONAL, STILL SLOW)
r1 = hc.leaves_list(row_linkage)
cl = ['I-Q','I-P','II','III']#hc.leaves_list(col_linkage)
mat = corrs.ix[r1,cl]
pvals = pvals.ix[r1,cl]

# REMOVE CLUSTERS NOT SIGNIFICANTLY CORRELATED
keep_rows = np.abs(mat).sum(axis=1)>0
mat = mat.ix[list(keep_rows[keep_rows==True].index)]
pvals = pvals.ix[list(keep_rows[keep_rows==True].index)]

# ROW COLORS
ind = list(mat.index) # CLUSTERS
row_colors = pd.Series(ind,index = ind).map(lut)
In [117]:
# VIEW CLUSTERED LABELED HEATMAP AND DENDROGRAM 
fig = plt.figure(figsize=(0.9,4))

# ADD ROW COLOR INDEX (CELL OF ORIGIN)
ax1 = fig.add_axes([0.12,0.1,0.20,0.6]) # [x0,y0,width,height]
x = 0
y = 0

for c in row_colors:
    pos = (x, y / len(row_colors))
    ax1.add_patch(patches.Rectangle(pos, 1, 1 / len(row_colors), color=c))
    if y >= len(row_colors)-1:
        x += 1
        y = 0
    else:
        y += 1
plt.axis('off')

# ADD MATRIX WITH GENE NAMES
axmatrix = fig.add_axes([0.2,0.1,0.7,0.6])

im = axmatrix.matshow(mat[(pvals<0.05)], aspect='auto', origin='lower', cmap=plt.cm.RdBu_r,vmin=-0.5,vmax=0.5)
xlabels = list(mat.columns)
ylabels = list(mat.index)
axmatrix.xaxis.set_ticks_position('bottom')
axmatrix.yaxis.set_ticks_position('right')
xtick = plt.xticks(range(len(xlabels)), xlabels, rotation = 90, fontsize = 10,fontname='Arial')
ytick = plt.yticks(range(len(ylabels)), ylabels, rotation = 0, fontsize = 10,fontname='Arial')
axmatrix.grid(False)

# ADD COLORBAR
axcolor = fig.add_axes([1.2,0.1,0.05,0.6])
cbar = plt.colorbar(im, cax=axcolor)

# ADD DENDROGRAM
sch.set_link_color_palette(['#808080', '#808080', '#808080', '#808080','#808080','#808080','#808080'])
ax2 = fig.add_axes([0.038,0.1,0.08,0.6]) # [x0,y0,width,height]
Z1 = sch.dendrogram(row_linkage, orientation='left',above_threshold_color='#808080')
ax2.set_xticks([])
ax2.set_yticks([])
plt.axis('off')

# SAVE FIGURE
figure_label = '_PatientLungEp_NDF_{}_CORRELATIONS_AVERAGE_{}_labeled'.format(subset_type,meta)
fn = FIG_output_stem + FN.replace(".h5", "") + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATED_PatientLungEp_NDF_XENOGRAFT_METASTASES_NK_DEPLETED_CORRELATIONS_AVERAGE_PHENOGRAPH_CLASS_labeled

Bi-partite representation of mapping between human and mouse metastatic states

In [118]:
import networkx as nx
import matplotlib.pyplot as plt
sns.set_style("white")

B = nx.Graph()
B.add_nodes_from(corrs.index, bipartite=0)
B.add_nodes_from(corrs.columns, bipartite=1)
list(B)

B.add_weighted_edges_from(
    [(idx, 'I-Q',row['I-Q']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
    [(idx, 'I-P',row['I-P']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
    [(idx, 'II',row['II']+1) for idx, row in corrs.iterrows()],weight='weight')
B.add_weighted_edges_from(
    [(idx, 'III',row['III']+1) for idx, row in corrs.iterrows()],weight='weight')

cutoff = 1.20 
SUB_B=nx.Graph( [ (u,v,d) for u,v,d in B.edges(data=True) if d['weight']>cutoff] )
pos = nx.spring_layout(SUB_B)
list(SUB_B)

color_map = []
edge_color_map = []
size_array = []
for n in SUB_B: 
    if n=='I-P':
        color_map.append('#F0F0F0')
        edge_color_map.append('black')
        size_array.append(2000)
    elif n=='I-Q':
        color_map.append('#E0E0E0')
        edge_color_map.append('black')
        size_array.append(2000)
    elif n=='II':
        color_map.append('#A0A0A0')
        edge_color_map.append('black')
        size_array.append(2000)
    elif n=='III':
        color_map.append('#202020')
        edge_color_map.append('black')
        size_array.append(2000)
    else:
        color_map.append(lut[n])
        edge_color_map.append('white')
        size_array.append(1000)

plt.figure(figsize = (10,10))
ax = plt.gca()

nodes = nx.draw_networkx_labels(SUB_B, pos,with_labels=True,node_size = size_array,
        node_color = color_map,fontsize=3,fontcolor = 'white')
nodes = nx.draw_networkx_nodes(SUB_B, pos,with_labels=True,node_size = size_array,
        node_color = color_map)
nodes.set_edgecolor(edge_color_map)

#4 c. Plot the edges - one by one!
all_weights = []
for (node1,node2,data) in SUB_B.edges(data=True):
    all_weights.append(data['weight']) #we'll use this when determining edge thickness

#4 b. Get unique weights
unique_weights = list(set(all_weights))        
for weight in unique_weights:
    weighted_edges = [(node1,node2) for (node1,node2,edge_attr) in SUB_B.edges(data=True) if edge_attr['weight']==weight]
    width = weight*len(SUB_B)/sum(all_weights)*3

    nx.draw_networkx_edges(SUB_B,pos,edgelist=weighted_edges,width=width) 


# SAVE FIGURE
FIG_output_stem = './figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_'+now+'/'
figure_label = 'BipartiteGraph-NKdepletedMouse-Human-Correlation-Labeled'
fn = FIG_output_stem + figure_label
plt.savefig(fn + '.png', dpi=400, transparent=True)
print(fn)
./figures/PATIENT_LUNG_ADENOCARCINOMA_ANNOTATEDdemo_11_12_19/BipartiteGraph-NKdepletedMouse-Human-Correlation-Labeled

Cells were assigned to quiescent (I-Q), regenerating (I-P/II), and SOX9 high escape (III) as visualized in the bipartite graph above. Clusters that did not signficiantly associate with any of these developmental states (not shown in bipartite) were annotated as 'NA'. All annotations are saved in the multi-index of each dataframe under index name: DEVELOPMENTAL_STATE.

Compare developmental cell state abundance in spontaneous vs. NK cell depleted macrometastases

In [119]:
subset_type = 'XENOGRAFT_METASTASES_NK_DEPLETED'
meta = 'DEVELOPMENTAL_STATE'

exec('QUERY = INDF_{}'.format(subset_type))
tissue_cluster_sizes = QUERY.groupby(level=['Legend', '{}'.format(meta)], axis=0).size().unstack().fillna(0)

NK_DEPLETED = tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0)
NK_DEPLETED['I/II'] = NK_DEPLETED['I-P/II']+NK_DEPLETED['I-Q']
NK_DEPLETED['TYPE'] = ['NK_DEPLETED']*NK_DEPLETED.shape[0]
NK_DEPLETED
Out[119]:
DEVELOPMENTAL_STATE I-P/II I-Q III NA I/II TYPE
Legend
antiGM1_M4531BO 0.014733 0.309392 0.084715 0.591160 0.324125 NK_DEPLETED
antiGM1_M4531KD 0.414765 0.092617 0.362416 0.130201 0.507383 NK_DEPLETED
antiGM1_M4544KD 0.040602 0.812030 0.048120 0.099248 0.852632 NK_DEPLETED
antiGM1_M4548BO 0.131884 0.386957 0.033333 0.447826 0.518841 NK_DEPLETED
antiGM1_M4549KD1 0.110534 0.320475 0.147626 0.421365 0.431009 NK_DEPLETED
antiGM1_M4549KD2 0.021649 0.342268 0.075258 0.560825 0.363918 NK_DEPLETED
antiGM1_M4737BO 0.423729 0.246973 0.152542 0.176755 0.670702 NK_DEPLETED
antiGM1_M4737KD 0.027182 0.776824 0.011445 0.184549 0.804006 NK_DEPLETED
In [120]:
subset_type = 'XENOGRAFT_METASTASES_NK'
meta = 'DEVELOPMENTAL_STATE'

exec('QUERY = INDF_{}'.format(subset_type))
tissue_cluster_sizes = QUERY.groupby(level=['Legend', '{}'.format(meta)], axis=0).size().unstack().fillna(0)

SPONTANEOUS = tissue_cluster_sizes.div(tissue_cluster_sizes.sum(axis=1),axis=0)
SPONTANEOUS['I/II'] = SPONTANEOUS['I-P/II']+SPONTANEOUS['I-Q']
# SELECT AGGRESSIVE MACROMETASTSESE FOR COMPARISON
SPONTANEOUS = SPONTANEOUS.ix[['EarlySpont_Lung_M4766','EarlySpont_Lung_M4768','EarlySpont_Lung_M4770']]
SPONTANEOUS['TYPE'] = ['Spontaneous']*SPONTANEOUS.shape[0]
SPONTANEOUS
Out[120]:
DEVELOPMENTAL_STATE I-P/II I-Q III NA I/II TYPE
Legend
EarlySpont_Lung_M4766 0.106899 0.206975 0.517058 0.169067 0.313874 Spontaneous
EarlySpont_Lung_M4768 0.105622 0.047700 0.828790 0.017888 0.153322 Spontaneous
EarlySpont_Lung_M4770 0.042419 0.091155 0.851083 0.015343 0.133574 Spontaneous
In [121]:
# SOURCE, CELL TYPE, FRACTION
outbreaks = SPONTANEOUS.append(NK_DEPLETED)
plot_vals = ['I-Q','I-P/II','I/II','III','NA']
boxplot_data = pd.DataFrame()
for val in plot_vals:
    current_df = pd.DataFrame()
    current_df['FRACTION'] = outbreaks[val]
    current_df['TYPE'] = outbreaks['TYPE'].values
    current_df['STATE'] = [val]*outbreaks.shape[0]
    boxplot_data = boxplot_data.append(current_df)

# GROUPED BOXPLOT WITH DATA POINTS OVERLAYED
fig = plt.figure(figsize = (3,5))
ax1 = plt.gca()
sns.set_style("white")
labels = ['NA','I/II','III']

palette = {'NK_DEPLETED':'white',
           'Spontaneous':'black'}

ax = sns.boxplot(x="STATE", y="FRACTION", hue="TYPE",data=boxplot_data, #palette=palette,
                hue_order = ['Spontaneous','NK_DEPLETED'],
                order = labels, fliersize = 4, showmeans=False,linewidth = 1, notch = False)
# iterate over boxes
for i,box in enumerate(ax.artists):
    box.set_edgecolor('black')
    box.set_facecolor('white')

    # iterate over whiskers and median lines
    for j in range(6*i,6*(i+1)):
         ax.lines[j].set_color('black')
            
g = sns.swarmplot(x="STATE", y="FRACTION", hue="TYPE",data=boxplot_data, 
                  palette=palette,
                  size = 5,edgecolor='#000000', linewidth=1,
                  hue_order = ['Spontaneous','NK_DEPLETED'],order = labels, split=True)
g.set_ylabel("\n\n\nFraction of metastasis",fontsize=14,fontname = 'Arial')
g.set_xticklabels(labels,rotation=90,fontname = 'Arial',fontsize = 10)
g.tick_params(labelsize=14)
sns.despine()
plt.ylim((0,1))
ax1.legend_.remove()